Repository: IntelligentSoftwareSystems/Galois Branch: master Commit: b67f94206a8c Files: 1074 Total size: 8.2 MB Directory structure: gitextract_7r5c247e/ ├── .circleci/ │ ├── config.yml │ └── longest_common_path.sh ├── .clang-format ├── .clang-tidy ├── .git-blame-ignore-revs ├── .gitignore ├── .gitmodules ├── .travis.yml ├── CMakeLists.txt ├── COPYRIGHT ├── LICENSE.txt ├── README.md ├── cmake/ │ ├── GaloisConfig.cmake.in │ └── Modules/ │ ├── CheckArchFlags.cmake │ ├── CheckCilk.cmake │ ├── CheckEndian.cmake │ ├── CheckHugePages.cmake │ ├── CheckMmap.cmake │ ├── CheckSchedSetAffinity.cmake │ ├── FindCBLAS.cmake │ ├── FindFortran.cmake │ ├── FindGASNET.cmake │ ├── FindGMP.cmake │ ├── FindGit.cmake │ ├── FindNUMA.cmake │ ├── FindOpenCL.cmake │ ├── FindPAPI.cmake │ ├── FindQGLViewer.cmake │ ├── FindTBB.cmake │ ├── FindVTune.cmake │ ├── GetGitVersion-write.cmake │ ├── GetGitVersion.cmake │ ├── HandleSanitizer.cmake │ ├── LibFindMacros.cmake │ └── UseStdMacro.cmake ├── config/ │ ├── sanitizers/ │ │ └── ubsan_blacklist.txt.in │ └── version.txt ├── external/ │ └── bliss/ │ └── bliss/ │ ├── COPYING │ ├── COPYING.LESSER │ ├── abgraph.hh │ ├── bignum.hh │ ├── defs.hh │ ├── graph.hh │ ├── heap.hh │ ├── kqueue.hh │ ├── kstack.hh │ ├── orbit.hh │ ├── partition.hh │ ├── search.h │ ├── uintseqhash.hh │ └── utils.hh ├── inputs/ │ ├── CMakeLists.txt │ └── cholesky/ │ ├── matrix1.txt │ ├── matrix1.txt.choleskyedges │ ├── matrix1.txt.dep │ ├── matrix1.txt.filled │ ├── very-sparse.txt │ ├── very-sparse.txt.choleskyedges │ ├── very-sparse.txt.dep │ └── very-sparse.txt.filled ├── libcusp/ │ ├── CMakeLists.txt │ └── include/ │ └── galois/ │ └── graphs/ │ ├── BasePolicies.h │ ├── CuSPPartitioner.h │ ├── DistributedGraph.h │ ├── GenericPartitioners.h │ ├── MiningPartitioner.h │ └── NewGeneric.h ├── libdist/ │ ├── CMakeLists.txt │ ├── include/ │ │ └── galois/ │ │ ├── DReducible.h │ │ ├── DTerminationDetector.h │ │ ├── DistGalois.h │ │ └── runtime/ │ │ ├── BareMPI.h │ │ ├── DistStats.h │ │ 
├── LWCI.h │ │ ├── MemUsage.h │ │ ├── Network.h │ │ ├── NetworkIO.h │ │ └── Serialize.h │ └── src/ │ ├── Barrier.cpp │ ├── DistGalois.cpp │ ├── DistStats.cpp │ ├── Network.cpp │ ├── NetworkBuffered.cpp │ ├── NetworkIOMPI.cpp │ └── NetworkLCI.cpp ├── libgalois/ │ ├── CMakeLists.txt │ ├── include/ │ │ └── galois/ │ │ ├── ArrayWrapper.h │ │ ├── AtomicHelpers.h │ │ ├── AtomicWrapper.h │ │ ├── Bag.h │ │ ├── CheckedObject.h │ │ ├── CopyableTuple.h │ │ ├── DynamicBitset.h │ │ ├── Endian.h │ │ ├── FixedSizeRing.h │ │ ├── FlatMap.h │ │ ├── Galois.h │ │ ├── GaloisForwardDecl.h │ │ ├── LargeArray.h │ │ ├── LazyArray.h │ │ ├── LazyObject.h │ │ ├── Loops.h │ │ ├── Mem.h │ │ ├── MethodFlags.h │ │ ├── NoDerefIterator.h │ │ ├── PODResizeableArray.h │ │ ├── ParallelSTL.h │ │ ├── PerThreadContainer.h │ │ ├── PriorityQueue.h │ │ ├── Reduction.h │ │ ├── SharedMemSys.h │ │ ├── Threads.h │ │ ├── Timer.h │ │ ├── Traits.h │ │ ├── TwoLevelIterator.h │ │ ├── TwoLevelIteratorA.h │ │ ├── UnionFind.h │ │ ├── UserContext.h │ │ ├── Version.h │ │ ├── config.h.in │ │ ├── gIO.h │ │ ├── gdeque.h │ │ ├── graphs/ │ │ │ ├── BufferedGraph.h │ │ │ ├── Details.h │ │ │ ├── FileGraph.h │ │ │ ├── Graph.h │ │ │ ├── GraphHelpers.h │ │ │ ├── LCGraph.h │ │ │ ├── LC_Adaptor_Graph.h │ │ │ ├── LC_CSR_CSC_Graph.h │ │ │ ├── LC_CSR_Graph.h │ │ │ ├── LC_CSR_Hypergraph.h │ │ │ ├── LC_InOut_Graph.h │ │ │ ├── LC_InlineEdge_Graph.h │ │ │ ├── LC_Linear_Graph.h │ │ │ ├── LC_Morph_Graph.h │ │ │ ├── MorphGraph.h │ │ │ ├── MorphHyperGraph.h │ │ │ ├── Morph_SepInOut_Graph.h │ │ │ ├── OCGraph.h │ │ │ ├── OfflineGraph.h │ │ │ ├── ReadGraph.h │ │ │ ├── SpatialTree.h │ │ │ └── TypeTraits.h │ │ ├── gslist.h │ │ ├── gstl.h │ │ ├── optional.h │ │ ├── runtime/ │ │ │ ├── Context.h │ │ │ ├── Executor_Deterministic.h │ │ │ ├── Executor_DoAll.h │ │ │ ├── Executor_ForEach.h │ │ │ ├── Executor_OnEach.h │ │ │ ├── Executor_Ordered.h │ │ │ ├── Executor_ParaMeter.h │ │ │ ├── ExtraTraits.h │ │ │ ├── Iterable.h │ │ │ ├── LoopStatistics.h │ │ │ ├── 
Mem.h │ │ │ ├── OperatorReferenceTypes.h │ │ │ ├── PagePool.h │ │ │ ├── Profile.h │ │ │ ├── Range.h │ │ │ ├── SharedMem.h │ │ │ ├── Statistics.h │ │ │ ├── Substrate.h │ │ │ ├── ThreadTimer.h │ │ │ ├── TiledExecutor.h │ │ │ ├── Tracer.h │ │ │ └── UserContextAccess.h │ │ ├── substrate/ │ │ │ ├── Barrier.h │ │ │ ├── CacheLineStorage.h │ │ │ ├── CompilerSpecific.h │ │ │ ├── EnvCheck.h │ │ │ ├── HWTopo.h │ │ │ ├── NumaMem.h │ │ │ ├── PaddedLock.h │ │ │ ├── PageAlloc.h │ │ │ ├── PerThreadStorage.h │ │ │ ├── PtrLock.h │ │ │ ├── SharedMem.h │ │ │ ├── SimpleLock.h │ │ │ ├── StaticInstance.h │ │ │ ├── Termination.h │ │ │ ├── ThreadPool.h │ │ │ └── ThreadRWlock.h │ │ └── worklists/ │ │ ├── AdaptiveObim.h │ │ ├── BulkSynchronous.h │ │ ├── Chunk.h │ │ ├── ExternalReference.h │ │ ├── LocalQueue.h │ │ ├── Obim.h │ │ ├── OrderedList.h │ │ ├── OwnerComputes.h │ │ ├── PerThreadChunk.h │ │ ├── Simple.h │ │ ├── StableIterator.h │ │ ├── WLCompileCheck.h │ │ ├── WorkList.h │ │ └── WorkListHelpers.h │ ├── src/ │ │ ├── Barrier.cpp │ │ ├── Barrier_Counting.cpp │ │ ├── Barrier_Dissemination.cpp │ │ ├── Barrier_MCS.cpp │ │ ├── Barrier_Pthread.cpp │ │ ├── Barrier_Simple.cpp │ │ ├── Barrier_Topo.cpp │ │ ├── Context.cpp │ │ ├── Deterministic.cpp │ │ ├── DynamicBitset.cpp │ │ ├── EnvCheck.cpp │ │ ├── FileGraph.cpp │ │ ├── FileGraphParallel.cpp │ │ ├── GraphHelpers.cpp │ │ ├── HWTopo.cpp │ │ ├── HWTopoDarwin.cpp │ │ ├── HWTopoLinux.cpp │ │ ├── Mem.cpp │ │ ├── NumaMem.cpp │ │ ├── OCFileGraph.cpp │ │ ├── PageAlloc.cpp │ │ ├── PagePool.cpp │ │ ├── ParaMeter.cpp │ │ ├── PerThreadStorage.cpp │ │ ├── PreAlloc.cpp │ │ ├── Profile.cpp │ │ ├── PtrLock.cpp │ │ ├── SharedMem.cpp │ │ ├── SharedMemSys.cpp │ │ ├── SimpleLock.cpp │ │ ├── Statistics.cpp │ │ ├── Substrate.cpp │ │ ├── Support.cpp │ │ ├── Termination.cpp │ │ ├── ThreadPool.cpp │ │ ├── ThreadTimer.cpp │ │ ├── Threads.cpp │ │ ├── Timer.cpp │ │ ├── Tracer.cpp │ │ ├── Version.cpp.in │ │ └── gIO.cpp │ └── test/ │ ├── CMakeLists.txt │ ├── README.md │ ├── 
acquire.cpp │ ├── bandwidth.cpp │ ├── barriers.cpp │ ├── empty-member-lcgraph.cpp │ ├── flatmap.cpp │ ├── floatingPointErrors.cpp │ ├── foreach.cpp │ ├── forward-declare-graph.cpp │ ├── gcollections.cpp │ ├── graph-compile.cpp │ ├── graph.cpp │ ├── gslist.cpp │ ├── hwtopo.cpp │ ├── lc-adaptor.cpp │ ├── lock.cpp │ ├── lockmgr.cpp │ ├── loop-overhead.cpp │ ├── mem.cpp │ ├── morphgraph-removal.cpp │ ├── morphgraph.cpp │ ├── move.cpp │ ├── oneach.cpp │ ├── papi.cpp │ ├── pc.cpp │ ├── reduction.cpp │ ├── sort.cpp │ ├── static.cpp │ ├── traits.cpp │ ├── twoleveliteratora.cpp │ ├── wakeup-overhead.cpp │ └── worklists-compile.cpp ├── libgluon/ │ ├── CMakeLists.txt │ ├── include/ │ │ └── galois/ │ │ ├── cuda/ │ │ │ ├── Context.h │ │ │ ├── DynamicBitset.h │ │ │ ├── EdgeContext.h │ │ │ ├── EdgeHostDecls.h │ │ │ └── HostDecls.h │ │ ├── graphs/ │ │ │ ├── GluonEdgeSubstrate.h │ │ │ └── GluonSubstrate.h │ │ └── runtime/ │ │ ├── DataCommMode.h │ │ ├── GlobalObj.h │ │ ├── SyncStructures.h │ │ └── cuda/ │ │ ├── DeviceEdgeSync.h │ │ └── DeviceSync.h │ └── src/ │ ├── GlobalObj.cpp │ ├── GluonSubstrate.cpp │ ├── SyncStructures.cpp │ └── cuda_device.cpp ├── libgpu/ │ ├── CMakeLists.txt │ ├── include/ │ │ ├── Timer.h │ │ ├── abitset.h │ │ ├── aolist.h │ │ ├── atomic_helpers.h │ │ ├── bmk2.h │ │ ├── component.h │ │ ├── counter.h │ │ ├── csr_graph.h │ │ ├── cuda_launch_config.hpp │ │ ├── cutil_subset.h │ │ ├── exclusive.h │ │ ├── failfast.h │ │ ├── gbar.cuh │ │ ├── gg.h │ │ ├── ggc_rt.h │ │ ├── ggcuda.h │ │ ├── instr.h │ │ ├── internal.h │ │ ├── lockarray.h │ │ ├── pipe.h │ │ ├── rv.h │ │ ├── sharedptr.h │ │ ├── snfile.h │ │ ├── thread_work.h │ │ └── worklist.h │ └── src/ │ ├── bmk2.c │ ├── csr_graph.cu │ ├── ggc_rt.cu │ ├── instr.cu │ ├── skelapp/ │ │ └── skel.cu │ ├── snappy.c │ └── snappy_test.c ├── libpangolin/ │ ├── CMakeLists.txt │ ├── README.md │ ├── gpu/ │ │ └── pangolin/ │ │ ├── bitsets.h │ │ ├── checker.h │ │ ├── cutils.h │ │ ├── element.cuh │ │ ├── embedding.cuh │ │ ├── 
graph_gpu.h │ │ ├── miner.cuh │ │ ├── timer.h │ │ └── types.cuh │ ├── include/ │ │ └── pangolin/ │ │ ├── BfsMining/ │ │ │ ├── edge_miner.h │ │ │ ├── edge_miner_api.h │ │ │ ├── embedding_list.h │ │ │ ├── engine.h │ │ │ ├── vertex_miner.h │ │ │ └── vertex_miner_api.h │ │ ├── base_embedding.h │ │ ├── canonical_graph.h │ │ ├── core.h │ │ ├── domain_support.h │ │ ├── edge_embedding.h │ │ ├── edge_type.h │ │ ├── element.h │ │ ├── embedding.h │ │ ├── embedding_queue.h │ │ ├── equivalence.h │ │ ├── gtypes.h │ │ ├── mgraph.h │ │ ├── miner.h │ │ ├── ptypes.h │ │ ├── quick_pattern.h │ │ ├── res_man.h │ │ ├── scan.h │ │ ├── types.h │ │ ├── util.h │ │ └── vertex_embedding.h │ └── src/ │ ├── BfsMining/ │ │ └── embedding_list.cpp │ ├── base_embedding.cpp │ ├── equivalence.cpp │ ├── quick_pattern.cpp │ └── vertex_embedding.cpp ├── libpygalois/ │ ├── CMakeLists.txt │ └── include/ │ └── galois/ │ └── Constants.h ├── libsupport/ │ ├── CMakeLists.txt │ ├── include/ │ │ └── galois/ │ │ ├── GetEnv.h │ │ └── Logging.h │ ├── src/ │ │ ├── GetEnv.cpp │ │ └── Logging.cpp │ └── test/ │ ├── CMakeLists.txt │ ├── getenv.cpp │ └── logging.cpp ├── lonestar/ │ ├── CMakeLists.txt │ ├── analytics/ │ │ ├── CMakeLists.txt │ │ ├── cpu/ │ │ │ ├── CMakeLists.txt │ │ │ ├── betweennesscentrality/ │ │ │ │ ├── AsyncStructs.h │ │ │ │ ├── BCEdge.h │ │ │ │ ├── BCNode.h │ │ │ │ ├── BetweennessCentrality.cpp │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── LevelStructs.h │ │ │ │ ├── OuterStructs.h │ │ │ │ ├── README.md │ │ │ │ └── control.h │ │ │ ├── bfs/ │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── README.md │ │ │ │ ├── bfs.cpp │ │ │ │ └── bfsDirectionOpt.cpp │ │ │ ├── bipart/ │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── Coarsening.cpp │ │ │ │ ├── Metric.cpp │ │ │ │ ├── Partitioning.cpp │ │ │ │ ├── README.md │ │ │ │ ├── Refine.cpp │ │ │ │ ├── bipart.cpp │ │ │ │ └── bipart.h │ │ │ ├── clustering/ │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── README.md │ │ │ │ ├── clustering.h │ │ │ │ ├── leidenClustering.cpp │ │ │ │ └── louvainClustering.cpp 
│ │ │ ├── connected-components/ │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── ConnectedComponents.cpp │ │ │ │ └── README.md │ │ │ ├── gmetis/ │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── Coarsening.cpp │ │ │ │ ├── GMetis.cpp │ │ │ │ ├── GraphReader.h │ │ │ │ ├── Metis.h │ │ │ │ ├── Metric.cpp │ │ │ │ ├── Partitioning.cpp │ │ │ │ ├── README.md │ │ │ │ └── Refine.cpp │ │ │ ├── independentset/ │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── IndependentSet.cpp │ │ │ │ └── README.md │ │ │ ├── k-core/ │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── README.md │ │ │ │ └── kcore.cpp │ │ │ ├── k-truss/ │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── K-Truss.cpp │ │ │ │ ├── README.md │ │ │ │ ├── Verify.cpp │ │ │ │ └── bmktest2.py │ │ │ ├── matching/ │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── README.md │ │ │ │ └── bipartite-mcm.cpp │ │ │ ├── matrixcompletion/ │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── README.md │ │ │ │ ├── bipartite-gen.py │ │ │ │ ├── commandLineParam.h │ │ │ │ ├── matrixCompletion.cpp │ │ │ │ ├── matrixCompletion.h │ │ │ │ ├── parselog.sh │ │ │ │ ├── plot.R │ │ │ │ └── runexp.py │ │ │ ├── pagerank/ │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── PageRank-constants.h │ │ │ │ ├── PageRank-pull.cpp │ │ │ │ ├── PageRank-push.cpp │ │ │ │ └── README.md │ │ │ ├── pointstoanalysis/ │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── PointsTo.cpp │ │ │ │ ├── README.md │ │ │ │ └── SparseBitVector.h │ │ │ ├── preflowpush/ │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── Preflowpush.cpp │ │ │ │ └── README.md │ │ │ ├── spanningtree/ │ │ │ │ ├── Boruvka.cpp │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── README.md │ │ │ │ └── UnionFind.h │ │ │ ├── sssp/ │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── README.md │ │ │ │ └── SSSP.cpp │ │ │ └── triangle-counting/ │ │ │ ├── CMakeLists.txt │ │ │ ├── README.md │ │ │ └── Triangles.cpp │ │ ├── distributed/ │ │ │ ├── CMakeLists.txt │ │ │ ├── README.md │ │ │ ├── betweennesscentrality/ │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── README.md │ │ │ │ ├── bc_level.cpp │ │ │ │ ├── bc_level_cuda.cu │ │ │ │ ├── bc_level_cuda.cuh │ │ │ │ ├── 
bc_level_cuda.h │ │ │ │ ├── bc_level_cuda.py │ │ │ │ ├── bc_level_sync.hh │ │ │ │ ├── bc_mr.cpp │ │ │ │ ├── mrbc_bitset.hh │ │ │ │ ├── mrbc_sync.hh │ │ │ │ └── mrbc_tree.h │ │ │ ├── bfs/ │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── README.md │ │ │ │ ├── bfs_pull.cpp │ │ │ │ ├── bfs_pull_cuda.cu │ │ │ │ ├── bfs_pull_cuda.cuh │ │ │ │ ├── bfs_pull_cuda.h │ │ │ │ ├── bfs_pull_cuda.py │ │ │ │ ├── bfs_pull_sync.hh │ │ │ │ ├── bfs_push.cpp │ │ │ │ ├── bfs_push_cuda.cu │ │ │ │ ├── bfs_push_cuda.cuh │ │ │ │ ├── bfs_push_cuda.h │ │ │ │ ├── bfs_push_cuda.py │ │ │ │ └── bfs_push_sync.hh │ │ │ ├── connected-components/ │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── README.md │ │ │ │ ├── cc_pull.cpp │ │ │ │ ├── cc_pull_cuda.cu │ │ │ │ ├── cc_pull_cuda.cuh │ │ │ │ ├── cc_pull_cuda.h │ │ │ │ ├── cc_pull_cuda.py │ │ │ │ ├── cc_pull_sync.hh │ │ │ │ ├── cc_push.cpp │ │ │ │ ├── cc_push_cuda.cu │ │ │ │ ├── cc_push_cuda.cuh │ │ │ │ ├── cc_push_cuda.h │ │ │ │ ├── cc_push_cuda.py │ │ │ │ └── cc_push_sync.hh │ │ │ ├── k-core/ │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── README.md │ │ │ │ ├── kcore_pull.cpp │ │ │ │ ├── kcore_pull_cuda.cu │ │ │ │ ├── kcore_pull_cuda.cuh │ │ │ │ ├── kcore_pull_cuda.h │ │ │ │ ├── kcore_pull_cuda.py │ │ │ │ ├── kcore_pull_sync.hh │ │ │ │ ├── kcore_push.cpp │ │ │ │ ├── kcore_push_cuda.cu │ │ │ │ ├── kcore_push_cuda.cuh │ │ │ │ ├── kcore_push_cuda.h │ │ │ │ ├── kcore_push_cuda.py │ │ │ │ └── kcore_push_sync.hh │ │ │ ├── matrixcompletion/ │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── README.md │ │ │ │ ├── matrixCompletion.cpp │ │ │ │ └── matrixCompletion_sync.hh │ │ │ ├── pagerank/ │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── README.md │ │ │ │ ├── pagerank_pull.cpp │ │ │ │ ├── pagerank_pull_cuda.cu │ │ │ │ ├── pagerank_pull_cuda.cuh │ │ │ │ ├── pagerank_pull_cuda.h │ │ │ │ ├── pagerank_pull_cuda.py │ │ │ │ ├── pagerank_pull_sync.hh │ │ │ │ ├── pagerank_push.cpp │ │ │ │ ├── pagerank_push_cuda.cu │ │ │ │ ├── pagerank_push_cuda.cuh │ │ │ │ ├── pagerank_push_cuda.h │ │ │ │ ├── pagerank_push_cuda.py │ 
│ │ │ └── pagerank_push_sync.hh │ │ │ ├── partition/ │ │ │ │ ├── CMakeLists.txt │ │ │ │ └── partition.cpp │ │ │ ├── sssp/ │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── README.md │ │ │ │ ├── sssp_pull.cpp │ │ │ │ ├── sssp_pull_cuda.cu │ │ │ │ ├── sssp_pull_cuda.cuh │ │ │ │ ├── sssp_pull_cuda.h │ │ │ │ ├── sssp_pull_cuda.py │ │ │ │ ├── sssp_pull_sync.hh │ │ │ │ ├── sssp_push.cpp │ │ │ │ ├── sssp_push_cuda.cu │ │ │ │ ├── sssp_push_cuda.cuh │ │ │ │ ├── sssp_push_cuda.h │ │ │ │ ├── sssp_push_cuda.py │ │ │ │ └── sssp_push_sync.hh │ │ │ └── triangle-counting/ │ │ │ ├── CMakeLists.txt │ │ │ ├── README.md │ │ │ ├── tc.cpp │ │ │ ├── tc_cuda.cu │ │ │ ├── tc_cuda.cuh │ │ │ ├── tc_cuda.h │ │ │ └── tc_cuda.py │ │ └── gpu/ │ │ ├── CMakeLists.txt │ │ ├── README.md │ │ ├── bfs/ │ │ │ ├── CMakeLists.txt │ │ │ ├── README.md │ │ │ ├── bfs.cu │ │ │ └── support.cu │ │ ├── connected-components/ │ │ │ ├── CMakeLists.txt │ │ │ ├── README.md │ │ │ ├── cc.cu │ │ │ └── support.cu │ │ ├── independentset/ │ │ │ ├── CMakeLists.txt │ │ │ ├── README.md │ │ │ ├── mis.cu │ │ │ └── support.cu │ │ ├── matrixcompletion/ │ │ │ ├── CMakeLists.txt │ │ │ ├── README.md │ │ │ ├── SGDAsyncEdgeCu.h │ │ │ ├── SGDCommonCu.h │ │ │ ├── SGDGraphCu.h │ │ │ ├── sgd.cu │ │ │ └── support.cu │ │ ├── pagerank/ │ │ │ ├── CMakeLists.txt │ │ │ ├── README.md │ │ │ ├── pagerank.cu │ │ │ └── support.cu │ │ ├── pointstoanalysis/ │ │ │ ├── CMakeLists.txt │ │ │ ├── README.md │ │ │ ├── andersen.cu │ │ │ ├── andersen.h │ │ │ ├── pta.cu │ │ │ ├── pta_tuning.h │ │ │ └── support.cu │ │ ├── spanningtree/ │ │ │ ├── CMakeLists.txt │ │ │ ├── README.md │ │ │ ├── mst-tex.cu │ │ │ ├── mst.cu │ │ │ ├── mst.h │ │ │ └── support.cu │ │ ├── sssp/ │ │ │ ├── CMakeLists.txt │ │ │ ├── README.md │ │ │ ├── sssp.cu │ │ │ └── support.cu │ │ └── triangle-counting/ │ │ ├── CMakeLists.txt │ │ ├── README.md │ │ ├── support.cu │ │ └── tc.cu │ ├── eda/ │ │ ├── CMakeLists.txt │ │ └── cpu/ │ │ ├── CMakeLists.txt │ │ ├── aig-rewriting/ │ │ │ ├── CMakeLists.txt │ │ │ ├── 
README.md │ │ │ ├── algorithms/ │ │ │ │ ├── ChoiceManager.cpp │ │ │ │ ├── ChoiceManager.h │ │ │ │ ├── CutManager.cpp │ │ │ │ ├── CutManager.h │ │ │ │ ├── CutPool.cpp │ │ │ │ ├── CutPool.h │ │ │ │ ├── NPNManager.cpp │ │ │ │ ├── NPNManager.h │ │ │ │ ├── PreCompGraphManager.cpp │ │ │ │ ├── PreCompGraphManager.h │ │ │ │ ├── PriorityCutManager.cpp │ │ │ │ ├── PriorityCutManager.h │ │ │ │ ├── PriorityCutPool.cpp │ │ │ │ ├── PriorityCutPool.h │ │ │ │ ├── ReconvDrivenCut.cpp │ │ │ │ ├── ReconvDrivenCut.h │ │ │ │ ├── RewriteManager.cpp │ │ │ │ └── RewriteManager.h │ │ │ ├── functional/ │ │ │ │ ├── BitVectorPool.cpp │ │ │ │ ├── BitVectorPool.h │ │ │ │ ├── FunctionHandler.h │ │ │ │ ├── FunctionHandler32.h │ │ │ │ ├── FunctionUtil.cpp │ │ │ │ └── FunctionUtil.h │ │ │ ├── main.cpp │ │ │ ├── misc/ │ │ │ │ └── util/ │ │ │ │ ├── utilString.cpp │ │ │ │ └── utilString.h │ │ │ ├── parsers/ │ │ │ │ ├── AigParser.cpp │ │ │ │ ├── AigParser.h │ │ │ │ ├── LookupTableParser.cpp │ │ │ │ ├── LookupTableParser.h │ │ │ │ ├── semantic_error.cpp │ │ │ │ ├── semantic_error.h │ │ │ │ ├── syntax_error.cpp │ │ │ │ ├── syntax_error.h │ │ │ │ ├── unexpected_eof.cpp │ │ │ │ └── unexpected_eof.h │ │ │ ├── subjectgraph/ │ │ │ │ └── aig/ │ │ │ │ ├── Aig.cpp │ │ │ │ └── Aig.h │ │ │ ├── writers/ │ │ │ │ ├── AigWriter.cpp │ │ │ │ ├── AigWriter.h │ │ │ │ ├── BlifWriter.cpp │ │ │ │ └── BlifWriter.h │ │ │ └── xxHash/ │ │ │ ├── xxhash.c │ │ │ └── xxhash.h │ │ └── sproute/ │ │ ├── BoilerPlate.h │ │ ├── CMakeLists.txt │ │ ├── DataProc.h │ │ ├── DataType.h │ │ ├── EdgeShift.h │ │ ├── LICENSE │ │ ├── README.md │ │ ├── RSMT.h │ │ ├── RipUp.h │ │ ├── bitmap_image.hpp │ │ ├── bitmap_test.cpp │ │ ├── bookshelf_IO.c │ │ ├── bookshelf_IO.h │ │ ├── cong.c │ │ ├── cong.h │ │ ├── dist.c │ │ ├── dist.h │ │ ├── dl.c │ │ ├── dl.h │ │ ├── err.c │ │ ├── err.h │ │ ├── flute-ckt │ │ ├── flute-ckt.c │ │ ├── flute-net │ │ ├── flute-net.c │ │ ├── flute.h │ │ ├── flute_mst.h │ │ ├── global.h │ │ ├── heap.c │ │ ├── heap.h │ │ ├── 
main.cpp │ │ ├── maze.h │ │ ├── maze3D.h │ │ ├── maze_finegrain.h │ │ ├── maze_finegrain_concurrent.h │ │ ├── maze_finegrain_lateupdate.h │ │ ├── maze_lock.h │ │ ├── memAlloc.c │ │ ├── memAlloc.h │ │ ├── mst2.c │ │ ├── mst2.h │ │ ├── neighbors.c │ │ ├── neighbors.h │ │ ├── parallel_router_morphgraph.cpp │ │ ├── rand-pts.c │ │ ├── route.h │ │ └── utility.h │ ├── libdistbench/ │ │ ├── CMakeLists.txt │ │ ├── include/ │ │ │ └── DistBench/ │ │ │ ├── Input.h │ │ │ ├── MiningStart.h │ │ │ ├── Output.h │ │ │ └── Start.h │ │ └── src/ │ │ ├── Input.cpp │ │ ├── Output.cpp │ │ └── Start.cpp │ ├── liblonestar/ │ │ ├── CMakeLists.txt │ │ ├── include/ │ │ │ └── Lonestar/ │ │ │ ├── BFS_SSSP.h │ │ │ ├── BoilerPlate.h │ │ │ └── Utils.h │ │ └── src/ │ │ └── BoilerPlate.cpp │ ├── mining/ │ │ ├── CMakeLists.txt │ │ ├── README.md │ │ ├── cpu/ │ │ │ ├── CMakeLists.txt │ │ │ ├── frequent-subgraph-mining/ │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── README.md │ │ │ │ ├── fsm.cpp │ │ │ │ └── fsm.h │ │ │ ├── k-clique-listing/ │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── README.md │ │ │ │ ├── kcl.cpp │ │ │ │ └── kcl.h │ │ │ ├── motif-counting/ │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── README.md │ │ │ │ ├── motif.cpp │ │ │ │ └── motif.h │ │ │ ├── subgraph-listing/ │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── README.mb │ │ │ │ ├── sgl_cycle.cpp │ │ │ │ └── sgl_diamond.cpp │ │ │ └── triangle-counting/ │ │ │ ├── CMakeLists.txt │ │ │ ├── README.md │ │ │ ├── tc.h │ │ │ └── tc_mine.cpp │ │ ├── gpu/ │ │ │ ├── CMakeLists.txt │ │ │ ├── frequent-subgraph-mining/ │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── README.md │ │ │ │ ├── fsm.cu │ │ │ │ ├── fsm.h │ │ │ │ └── fsm_gpu.cpp │ │ │ ├── k-clique-listing/ │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── README.md │ │ │ │ ├── kcl.cu │ │ │ │ ├── kcl.h │ │ │ │ └── kcl_gpu.cpp │ │ │ ├── motif-counting/ │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── README.md │ │ │ │ ├── motif.cu │ │ │ │ ├── motif.h │ │ │ │ └── motif_gpu.cpp │ │ │ └── triangle-counting/ │ │ │ ├── CMakeLists.txt │ │ │ ├── README.md │ │ │ 
├── tc.h │ │ │ ├── tc_mine.cu │ │ │ └── tc_mine_gpu.cpp │ │ └── libminingbench/ │ │ ├── CMakeLists.txt │ │ ├── include/ │ │ │ └── MiningBench/ │ │ │ └── Start.h │ │ └── src/ │ │ ├── Input.cpp │ │ └── Start.cpp │ ├── scientific/ │ │ ├── CMakeLists.txt │ │ ├── cpu/ │ │ │ ├── CMakeLists.txt │ │ │ ├── barneshut/ │ │ │ │ ├── Barneshut.cpp │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── Point.h │ │ │ │ └── README.md │ │ │ ├── delaunayrefinement/ │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── Cavity.h │ │ │ │ ├── DelaunayRefinement.cpp │ │ │ │ ├── Edge.h │ │ │ │ ├── Element.h │ │ │ │ ├── Mesh.h │ │ │ │ ├── README.md │ │ │ │ ├── Subgraph.h │ │ │ │ ├── Tuple.h │ │ │ │ └── Verifier.h │ │ │ ├── delaunaytriangulation/ │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── Cavity.h │ │ │ │ ├── DelaunayTriangulation.cpp │ │ │ │ ├── DelaunayTriangulationDet.cpp │ │ │ │ ├── Element.cpp │ │ │ │ ├── Element.h │ │ │ │ ├── Graph.h │ │ │ │ ├── Point.h │ │ │ │ ├── QuadTree.h │ │ │ │ ├── README.md │ │ │ │ ├── Tuple.h │ │ │ │ └── Verifier.h │ │ │ └── longestedge/ │ │ │ ├── CMakeLists.txt │ │ │ ├── README.md │ │ │ ├── out/ │ │ │ │ └── .gitignore │ │ │ ├── src/ │ │ │ │ ├── LongestEdge.cpp │ │ │ │ ├── conditions/ │ │ │ │ │ ├── ConditionChecker.h │ │ │ │ │ ├── DummyConditionChecker.h │ │ │ │ │ └── TerrainConditionChecker.h │ │ │ │ ├── libmgrs/ │ │ │ │ │ ├── LICENSE │ │ │ │ │ ├── mgrs.c │ │ │ │ │ ├── mgrs.h │ │ │ │ │ ├── polarst.c │ │ │ │ │ ├── polarst.h │ │ │ │ │ ├── tranmerc.c │ │ │ │ │ ├── tranmerc.h │ │ │ │ │ ├── ups.c │ │ │ │ │ ├── ups.h │ │ │ │ │ ├── utm.c │ │ │ │ │ └── utm.h │ │ │ │ ├── model/ │ │ │ │ │ ├── Coordinates.h │ │ │ │ │ ├── EdgeData.h │ │ │ │ │ ├── Graph.h │ │ │ │ │ ├── Map.cpp │ │ │ │ │ ├── Map.h │ │ │ │ │ ├── NodeData.h │ │ │ │ │ └── ProductionState.h │ │ │ │ ├── productions/ │ │ │ │ │ ├── Production.h │ │ │ │ │ ├── Production1.h │ │ │ │ │ ├── Production2.h │ │ │ │ │ ├── Production3.h │ │ │ │ │ ├── Production4.h │ │ │ │ │ ├── Production5.h │ │ │ │ │ └── Production6.h │ │ │ │ ├── readers/ │ │ │ │ │ ├── 
AsciiReader.cpp │ │ │ │ │ ├── AsciiReader.h │ │ │ │ │ ├── InpReader.cpp │ │ │ │ │ ├── InpReader.h │ │ │ │ │ ├── SrtmReader.cpp │ │ │ │ │ └── SrtmReader.h │ │ │ │ ├── utils/ │ │ │ │ │ ├── ConnectivityManager.h │ │ │ │ │ ├── GaloisUtils.h │ │ │ │ │ ├── GraphGenerator.h │ │ │ │ │ ├── MyGraphFormatWriter.h │ │ │ │ │ ├── Utils.cpp │ │ │ │ │ └── Utils.h │ │ │ │ └── writers/ │ │ │ │ ├── InpWriter.cpp │ │ │ │ ├── InpWriter.h │ │ │ │ ├── TriangleFormatWriter.cpp │ │ │ │ └── TriangleFormatWriter.h │ │ │ └── test/ │ │ │ ├── TestMain.cpp │ │ │ ├── catch.hpp │ │ │ ├── model/ │ │ │ │ ├── MapTest.cpp │ │ │ │ └── ProductionStateTest.cpp │ │ │ ├── productions/ │ │ │ │ └── Production1Test.cpp │ │ │ ├── testUtils.cpp │ │ │ └── utils/ │ │ │ ├── ConnectivityManagerTest.cpp │ │ │ └── UtilsTest.cpp │ │ └── gpu/ │ │ ├── CMakeLists.txt │ │ ├── README.md │ │ ├── barneshut/ │ │ │ ├── CMakeLists.txt │ │ │ ├── LICENSE.md │ │ │ ├── README.md │ │ │ ├── bh.cu │ │ │ └── bh_tuning.h │ │ └── delaunayrefinement/ │ │ ├── CMakeLists.txt │ │ ├── README.md │ │ ├── devel.h │ │ ├── dmr-nontex.cu │ │ ├── dmr.cu │ │ ├── dmr.h │ │ ├── dmrggc.inc │ │ ├── geomprim.h │ │ ├── main.inc │ │ └── meshfiles.h │ └── tutorial_examples/ │ ├── CMakeLists.txt │ ├── ConflictAwareTorus.cpp │ ├── CountLevels.cpp │ ├── ExampleWrappedWorklist.cpp │ ├── GraphTraversalPullOperator.cpp │ ├── GraphTraversalPushOperator.cpp │ ├── GraphTraversalSerial.cpp │ ├── HelloWorld.cpp │ ├── SSSPPullSimple.cpp │ ├── SSSPPushSimple.cpp │ ├── SSSPsimple.cpp │ ├── SpanningTree.cpp │ ├── ThirdPartyMalloc.cpp │ ├── Torus.cpp │ ├── TorusConstruction.cpp │ └── TorusImproved.cpp ├── pyproject.toml ├── python/ │ ├── CMakeLists.txt │ └── galois/ │ ├── CMakeLists.txt │ ├── __init__.py │ ├── _bfs.pyx │ ├── _connected_components.pyx │ ├── _pagerank.pyx │ ├── _sssp.pyx │ ├── bfs.py │ ├── connected_components.py │ ├── cpp/ │ │ ├── __init__.pxd │ │ ├── libgalois/ │ │ │ ├── Galois.pxd │ │ │ ├── Timer.pxd │ │ │ ├── Worklist.pxd │ │ │ ├── __init__.pxd │ │ │ └── 
graphs/ │ │ │ ├── Graph.pxd │ │ │ ├── Util.pxd │ │ │ └── __init__.pxd │ │ └── libstd/ │ │ ├── __init__.pxd │ │ └── atomic.pxd │ ├── pagerank.py │ ├── shmem.pxd │ └── shmem.pyx ├── scripts/ │ ├── CMakeLists.txt │ ├── abelian_log_parser.py │ ├── check_format.sh │ ├── constraints_checking/ │ │ └── parse_dump.rb │ ├── docker/ │ │ ├── Dockerfile │ │ ├── Dockerfile.msan │ │ ├── README.md │ │ ├── msan/ │ │ │ ├── build-boost.sh │ │ │ ├── build-llvm.sh │ │ │ └── config-galois.sh │ │ └── run-image.sh │ ├── experimental/ │ │ ├── abelian_log_parser_analysis.py │ │ ├── abelian_log_parser_deprecated.py │ │ ├── abelian_log_parser_multipleRuns.py │ │ ├── abelian_log_parser_multipleRuns2.py │ │ ├── bmk2/ │ │ │ ├── __init__.py │ │ │ ├── bispec.py │ │ │ ├── bmk2.py │ │ │ ├── checkers.py │ │ │ ├── collect.py │ │ │ ├── collect_multi.py │ │ │ ├── common.py │ │ │ ├── config.py │ │ │ ├── convert.py │ │ │ ├── convgraph.py │ │ │ ├── core.py │ │ │ ├── extras.py │ │ │ ├── inputdb.py │ │ │ ├── inputprops.py │ │ │ ├── logproc.py │ │ │ ├── mapfile.py │ │ │ ├── measure_energy.py │ │ │ ├── opdb.py │ │ │ ├── overlays.py │ │ │ ├── perf.py │ │ │ ├── rsinfo.py │ │ │ ├── sconvert.py │ │ │ ├── summlog.py │ │ │ └── test2.py │ │ ├── buildFunc.sh │ │ ├── buildMultiCompiler.sh │ │ ├── buildMultiVersion.sh │ │ ├── buildOnce.sh │ │ ├── distbmk2/ │ │ │ ├── README │ │ │ ├── bmk2.cfg │ │ │ ├── bmkprops.py │ │ │ ├── bmktest2.py │ │ │ ├── dist.bispec │ │ │ ├── dist.inputdb │ │ │ └── dist.inputprops │ │ ├── galois_license_fixer.py │ │ ├── githubbmk2_setup/ │ │ │ ├── README │ │ │ ├── bmk2.cfg │ │ │ ├── bmkprops.py │ │ │ ├── bmktest2.py │ │ │ ├── defaultrunscript.sh │ │ │ ├── lonestar.bispec │ │ │ ├── lonestar.inputdb │ │ │ └── lonestar.inputprops │ │ ├── heterogeneousGalois/ │ │ │ ├── CPU_run_scripts_stampede/ │ │ │ │ ├── ruby_BFS_CC_SSSP_rmat_USA_twitter_Pull.sh │ │ │ │ ├── ruby_BFS_CC_SSSP_rmat_USA_twitter_Pull_Vcut.sh │ │ │ │ ├── ruby_BFS_CC_SSSP_rmat_USA_twitter_Push.sh │ │ │ │ └── 
ruby_BFS_CC_SSSP_rmat_USA_twitter_Push_Vcut.sh │ │ │ ├── README_compiler │ │ │ ├── batch_bridges_all.sh │ │ │ ├── batch_single-host_multi-device_all.sh │ │ │ ├── batch_stampede_all.sh │ │ │ ├── batch_verify.sh │ │ │ ├── compile.sh │ │ │ ├── compile_all.sh │ │ │ ├── cuda_compile.sh │ │ │ ├── run_bridges.template.sbatch │ │ │ ├── run_bridges_all.sh │ │ │ ├── run_single-host_multi-device_all.sh │ │ │ ├── run_stampede.template.sbatch │ │ │ ├── run_stampede_all.sh │ │ │ └── verify.sh │ │ ├── lonestarbmk2/ │ │ │ ├── README │ │ │ ├── bmk2.cfg │ │ │ ├── bmkprops.py │ │ │ ├── bmktest2.py │ │ │ ├── defaultrunscript.sh │ │ │ ├── lonestar.bispec │ │ │ ├── lonestar.inputdb │ │ │ └── lonestar.inputprops │ │ ├── older/ │ │ │ ├── backend.pl │ │ │ ├── prune_headers_function.pl │ │ │ ├── prune_headers_line.pl │ │ │ ├── report.pl │ │ │ ├── report_vtune.pl │ │ │ ├── run_boruvka.pl │ │ │ ├── run_clustering.pl │ │ │ ├── run_delaunayrefinement.pl │ │ │ ├── run_sssp.pl │ │ │ └── vtune_sssp.pl │ │ ├── pangolin/ │ │ │ ├── batch_verify.sh │ │ │ ├── fsm.citeseer.2.300 │ │ │ ├── fsm.citeseer.2.500 │ │ │ ├── fsm.patent.2.1000 │ │ │ ├── fsm.patent.2.300 │ │ │ ├── fsm.patent.2.500 │ │ │ ├── fsm.patent.2.5000 │ │ │ ├── kcl.citeseer.4 │ │ │ ├── kcl.citeseer.5 │ │ │ ├── kcl.mico.4 │ │ │ ├── kcl.mico.5 │ │ │ ├── kcl.patent.3 │ │ │ ├── kcl.patent.4 │ │ │ ├── kcl.patent.5 │ │ │ ├── motif.citeseer.3 │ │ │ ├── motif.citeseer.4 │ │ │ ├── motif.mico.3 │ │ │ ├── motif.mico.4 │ │ │ ├── motif.patent.3 │ │ │ ├── motif.patent.4 │ │ │ ├── result_checker.py │ │ │ └── verify.sh │ │ ├── runBFS.sh │ │ └── runSSSP.sh │ ├── find_ifdefs.sh │ ├── galois_log_parser.R │ ├── galois_log_parser_minimal.R │ ├── gitFindBigCommits.sh │ ├── hcompiler.sh │ ├── intel_study_scripts/ │ │ ├── README.md │ │ ├── download_inputs.sh │ │ ├── run_bc.sh │ │ ├── run_bfs.sh │ │ ├── run_cc.sh │ │ ├── run_pr.sh │ │ ├── run_sssp.sh │ │ └── run_tc.sh │ ├── iss_load_modules.sh │ ├── make_dist.sh.in │ ├── merge_vtune.pl │ ├── plot_lonestar_apps.R │ 
├── quick_plot.pl │ ├── rcat.py │ ├── report.py │ ├── report_vtune.pl │ ├── result_checker.py │ ├── run.py │ ├── run_vtune.pl │ ├── sparse-matrices/ │ │ ├── diff_edgelists.py │ │ ├── iperm2order.pl │ │ ├── mtx2edgelist.pl │ │ └── reorder.pl │ ├── tcp_starter.py │ ├── visual/ │ │ ├── plot2Dmesh.m │ │ ├── plotGraph.R │ │ ├── plotGraph3d.R │ │ ├── plotTimeStamps.m │ │ └── triplot.m │ └── vtune.sh ├── setup.py ├── tests/ │ └── test_imports.py └── tools/ ├── CMakeLists.txt ├── dist-graph-convert/ │ ├── CMakeLists.txt │ ├── dist-graph-convert-helpers.cpp │ ├── dist-graph-convert-helpers.h │ └── dist-graph-convert.cpp ├── graph-convert/ │ ├── CMakeLists.txt │ ├── graph-convert-huge.cpp │ ├── graph-convert.cpp │ └── test-inputs/ │ ├── sample.csv │ ├── with-blank-lines.edgelist │ ├── with-blank-lines.edgelist.expected │ ├── with-comments.edgelist │ └── with-comments.edgelist.expected ├── graph-remap/ │ ├── CMakeLists.txt │ └── graph-remap.cpp └── graph-stats/ ├── CMakeLists.txt └── graph-stats.cpp ================================================ FILE CONTENTS ================================================ ================================================ FILE: .circleci/config.yml ================================================ version: 2.1 common_step: &cmake_build_test - run: | cmake --build /tmp/build --target input if [ -n "$CIRCLE_PULL_REQUEST" ]; then \ subset=$(/bin/bash .circleci/longest_common_path.sh); \ echo "Changes of ${CIRCLE_SHA1} are all under $subset"; \ fi cmake --build /tmp/build/${subset:-.} --parallel 2 # Run tests as non-root otherwise MPI will complain (cd /tmp/build/${subset:-.} \ && chown -R runner . \ && su runner -c "ctest --output-on-failure --label-regex quick --parallel 2") # TODO: These builds are currently configured to # install the needed dependencies in each container # at the start of each build. The dependencies aren't huge, # but that is slower and does waste some bandwidth. 
# We should eventually roll the set up for each # container into a separate dockerfile and push custom # build images to dockerhub so that setting up packages # during the actual CI testing is no longer necessary. jobs: "CheckFormat": docker: - image: ubuntu:bionic steps: - checkout - run: | apt-get -q update -y apt-get -q install -y apt-transport-https ca-certificates git gnupg software-properties-common wget wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key 2>/dev/null | apt-key add - apt-add-repository -y 'deb https://apt.llvm.org/bionic/ llvm-toolchain-bionic-10 main' apt-get -q update -y - run: git submodule sync - run: git submodule update --init - run: | apt-get -q install -y \ clang-format-10 update-alternatives --install /usr/bin/clang-format clang-format /usr/bin/clang-format-10 50 if [ -n "$CIRCLE_PULL_REQUEST" ]; then \ subset=$(/bin/bash .circleci/longest_common_path.sh); \ echo "Changes of ${CIRCLE_SHA1} are all under $subset"; \ fi scripts/check_format.sh ${subset:-.} "Sanitize": docker: - image: ubuntu:bionic steps: - checkout - run: | apt-get -q update -y apt-get -q install -y apt-transport-https ca-certificates git gnupg software-properties-common wget wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | apt-key add - wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key 2>/dev/null | apt-key add - apt-add-repository -y 'ppa:ubuntu-toolchain-r/test' apt-add-repository -y 'deb https://apt.llvm.org/bionic/ llvm-toolchain-bionic-10 main' apt-add-repository -y 'deb https://apt.kitware.com/ubuntu/ bionic main' apt-add-repository -y 'ppa:mhier/libboost-latest' apt-get -q update -y - run: git submodule sync - run: git submodule update --init - run: | apt-get -q install -y \ clang-10 \ cmake \ libboost1.70-dev \ libeigen3-dev \ openmpi-bin \ libopenmpi-dev \ llvm-7-dev \ libz-dev \ libfmt-dev update-alternatives --install /usr/bin/clang clang /usr/bin/clang-10 50 update-alternatives --install /usr/bin/clang++ clang++ 
/usr/bin/clang++-10 50 chmod 755 /root useradd runner mkdir -p /tmp/build cmake -S . -B /tmp/build \ -DCMAKE_C_COMPILER=clang \ -DCMAKE_CXX_COMPILER=clang++ \ -DGALOIS_USE_SANITIZER="Address;Undefined" - <<: *cmake_build_test "Debian": docker: - image: debian:10 steps: - checkout - run: | apt-get -q update -y apt-get -q install -y git - run: git submodule sync - run: git submodule update --init - run: | apt-get -q install -y \ cmake \ g++ \ gcc \ libboost-iostreams-dev \ libboost-serialization-dev \ libeigen3-dev \ libmpich-dev \ llvm-7-dev \ mpich \ zlib1g-dev \ libfmt-dev chmod 755 /root useradd runner mkdir -p /tmp/build cmake -S . -B /tmp/build \ -DGALOIS_ENABLE_DIST=ON - <<: *cmake_build_test "Ubuntu-18_04": docker: - image: ubuntu:18.04 steps: - checkout - run: | apt-get -q update -y apt-get -q install -y apt-transport-https ca-certificates git gnupg software-properties-common wget wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | apt-key add - apt-add-repository 'deb https://apt.kitware.com/ubuntu/ bionic main' apt-get -q update -y - run: git submodule sync - run: git submodule update --init - run: | apt-get -q install -y \ cmake \ g++ \ gcc \ libboost-all-dev \ libeigen3-dev \ libopenmpi-dev \ llvm-7-dev \ openmpi-bin \ ssh \ libfmt-dev chmod 755 /root useradd runner mkdir -p /tmp/build cmake -S . 
-B /tmp/build \ -DGALOIS_ENABLE_DIST=ON - <<: *cmake_build_test "Ubuntu-18_04-cuda11_0_3-build-only": docker: - image: nvidia/cuda:11.0.3-devel-ubuntu18.04 steps: - checkout - run: | apt-get -q update -y apt-get -q install -y apt-transport-https ca-certificates git gnupg software-properties-common wget wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | apt-key add - apt-add-repository 'deb https://apt.kitware.com/ubuntu/ bionic main' apt-get -q update -y - run: git submodule sync - run: git submodule update --init - run: | apt-get -q install -y \ cmake \ g++ \ gcc \ libboost-all-dev \ libeigen3-dev \ libopenmpi-dev \ llvm-7-dev \ openmpi-bin \ ssh \ libfmt-dev cmake -S . -B /tmp/build \ -DGALOIS_ENABLE_DIST=ON \ -DGALOIS_ENABLE_GPU=ON cmake --build /tmp/build --target input cmake --build /tmp/build --parallel 2 "Ubuntu-18_04-cuda11_1_1-build-only": docker: - image: nvidia/cuda:11.1.1-devel-ubuntu18.04 steps: - checkout - run: | apt-get -q update -y apt-get -q install -y apt-transport-https ca-certificates git gnupg software-properties-common wget wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | apt-key add - apt-add-repository 'deb https://apt.kitware.com/ubuntu/ bionic main' apt-get -q update -y - run: git submodule sync - run: git submodule update --init - run: | apt-get -q install -y \ cmake \ g++ \ gcc \ libboost-all-dev \ libeigen3-dev \ libopenmpi-dev \ llvm-7-dev \ openmpi-bin \ ssh \ libfmt-dev cmake -S . 
-B /tmp/build \ -DGALOIS_ENABLE_DIST=ON \ -DGALOIS_ENABLE_GPU=ON cmake --build /tmp/build --target input cmake --build /tmp/build --parallel 2 "CentOS-8-gcc": docker: - image: centos:8 steps: - checkout - run: | # CentOS Linux 8 has reached End Of Life (EOL) on December 31st, 2021 ls /etc/yum.repos.d/ > /dev/null 2>&1 sed -i 's/mirrorlist/#mirrorlist/g' /etc/yum.repos.d/CentOS-* sed -i 's|#baseurl=http://mirror.centos.org|baseurl=http://vault.centos.org|g' /etc/yum.repos.d/CentOS-* - run: | # fmt-devel is in EPEL yum -y -q install https://dl.fedoraproject.org/pub/epel/epel-release-latest-8.noarch.rpm # eigen3-devel needs PowerTools packages yum -y -q install dnf-plugins-core yum -y -q config-manager --set-enabled powertools yum -y -q install git - run: git submodule sync - run: git submodule update --init - run: | yum -y -q install \ boost-devel \ cmake \ eigen3-devel \ gcc \ gcc-c++ \ llvm-devel \ llvm-static \ make \ mpich-devel \ ncurses-devel \ wget \ zlib-devel \ fmt-devel wget -O - https://github.com/Kitware/CMake/releases/download/v3.17.0/cmake-3.17.0-Linux-x86_64.tar.gz | tar -xz -f - -C /usr/local ln -s /usr/local/cmake-3.17.0-Linux-x86_64/bin/cmake /usr/local/bin/cmake ln -s /usr/local/cmake-3.17.0-Linux-x86_64/bin/ctest /usr/local/bin/ctest # Make the "module" command work in the subsequent shell sessions. cat /etc/profile.d/modules.sh >> $BASH_ENV echo "module load mpi" >> $BASH_ENV - run: | chmod 755 /root useradd runner mkdir -p /tmp/build cmake -S . 
-B /tmp/build \ -DGALOIS_ENABLE_DIST=ON - <<: *cmake_build_test "CentOS-8-clang": docker: - image: centos:8 steps: - checkout - run: | # CentOS Linux 8 has reached End Of Life (EOL) on December 31st, 2021 ls /etc/yum.repos.d/ > /dev/null 2>&1 sed -i 's/mirrorlist/#mirrorlist/g' /etc/yum.repos.d/CentOS-* sed -i 's|#baseurl=http://mirror.centos.org|baseurl=http://vault.centos.org|g' /etc/yum.repos.d/CentOS-* - run: | # fmt-devel is in EPEL yum -y -q install https://dl.fedoraproject.org/pub/epel/epel-release-latest-8.noarch.rpm # eigen3-devel needs PowerTools packages yum -y -q install dnf-plugins-core yum -y -q config-manager --set-enabled powertools yum -y -q install git - run: git submodule sync - run: git submodule update --init - run: | yum -y -q install \ boost-devel \ eigen3-devel \ llvm-devel \ llvm-static \ llvm-toolset \ make \ openmpi-devel \ ncurses-devel \ wget \ zlib-devel \ fmt-devel wget -O - https://github.com/Kitware/CMake/releases/download/v3.17.0/cmake-3.17.0-Linux-x86_64.tar.gz | tar -xz -f - -C /usr/local ln -s /usr/local/cmake-3.17.0-Linux-x86_64/bin/cmake /usr/local/bin/cmake ln -s /usr/local/cmake-3.17.0-Linux-x86_64/bin/ctest /usr/local/bin/ctest # Make the "module" command work in the subsequent shell sessions. cat /etc/profile.d/modules.sh >> $BASH_ENV echo "module load mpi" >> $BASH_ENV - run: | chmod 755 /root useradd runner mkdir -p /tmp/build cmake -S . -B /tmp/build \ -DCMAKE_C_COMPILER=clang \ -DCMAKE_CXX_COMPILER=clang++ \ # -DGALOIS_ENABLE_DIST=ON - <<: *cmake_build_test "Arch": docker: - image: archlinux:base steps: - checkout - run: | pacman -Syu --noconfirm pacman -q -S --noconfirm git - run: git submodule sync - run: git submodule update --init - run: | # NB(ddn): make requires libffi but its package doesn't depend on it. pacman -q -S --noconfirm \ boost \ cmake \ eigen \ gcc \ libffi \ llvm \ make \ openmpi \ fmt chmod 755 /root useradd runner mkdir -p /tmp/build cmake -S . 
-B /tmp/build \ -DGALOIS_ENABLE_DIST=ON - <<: *cmake_build_test "Alpine": docker: - image: alpine:latest steps: - checkout - run: | apk add --no-cache --no-progress git bash - run: git submodule sync - run: git submodule update --init - run: | apk add --no-cache --no-progress \ boost-dev \ cmake \ eigen \ g++ \ gcc \ llvm14-dev \ llvm14-static \ make \ musl-dev \ openssh-client \ zlib-dev \ fmt-dev chmod 755 /root adduser -D runner mkdir -p /tmp/build cmake -S . -B /tmp/build - <<: *cmake_build_test "Fedora-gcc": docker: - image: fedora:latest steps: - checkout - run: | yum -y -q install git - run: git submodule sync - run: git submodule update --init - run: | yum -y -q install \ boost-devel \ cmake \ eigen3-devel \ gcc-c++ \ llvm-devel \ llvm-static \ make \ mpich-devel \ wget \ zlib-devel \ fmt-devel chmod 755 /root useradd runner mkdir -p /tmp/build # Get the "module" function set up before loading MPI. cat /etc/profile.d/modules.sh >> $BASH_ENV echo "module load mpi" >> $BASH_ENV - run: | cmake -S . -B /tmp/build \ -DGALOIS_ENABLE_DIST=ON - <<: *cmake_build_test "Fedora-clang": docker: - image: fedora:latest steps: - checkout - run: | yum -y -q install git - run: git submodule sync - run: git submodule update --init - run: | yum -y -q install \ boost-devel \ clang \ cmake \ eigen3-devel \ llvm-devel \ llvm-static \ make \ openmpi-devel \ wget \ zlib-devel \ fmt-devel chmod 755 /root useradd runner mkdir -p /tmp/build # Get the "module" function set up before loading MPI. cat /etc/profile.d/modules.sh >> $BASH_ENV echo "module load mpi" >> $BASH_ENV - run: | cmake -S . 
-B /tmp/build \
              -DCMAKE_C_COMPILER=clang \
              -DCMAKE_CXX_COMPILER=clang++
            # -DGALOIS_ENABLE_DIST=ON
      - <<: *cmake_build_test

workflows:
  build:
    jobs:
      - "CheckFormat"
      - "Sanitize"
      - "Alpine":
          requires:
            - "CheckFormat"
            # - "Sanitize"
      - "Arch":
          requires:
            - "CheckFormat"
            # - "Sanitize"
      - "CentOS-8-clang":
          requires:
            - "CheckFormat"
            # - "Sanitize"
      - "CentOS-8-gcc":
          requires:
            - "CheckFormat"
            # - "Sanitize"
      - "Debian":
          requires:
            - "CheckFormat"
            # - "Sanitize"
      - "Fedora-clang":
          requires:
            - "CheckFormat"
            # - "Sanitize"
      - "Fedora-gcc":
          requires:
            - "CheckFormat"
            # - "Sanitize"
      - "Ubuntu-18_04":
          requires:
            - "CheckFormat"
            # - "Sanitize"
      - "Ubuntu-18_04-cuda11_1_1-build-only":
          requires:
            - "CheckFormat"
            # - "Sanitize"
      - "Ubuntu-18_04-cuda11_0_3-build-only":
          requires:
            - "CheckFormat"
            # - "Sanitize"


================================================
FILE: .circleci/longest_common_path.sh
================================================
#!/bin/bash
# For PR build only; find the longest common path prefix as the build and test subset

# Print the deepest ancestor directory of the first argument that contains a
# CMakeLists.txt and is a path prefix of EVERY argument; print "." when no
# such directory exists. Arguments are expected to be repo-relative paths
# (as produced by `git diff --name-only`).
longest_common_prefix() {
  declare -a possible_prefix
  declare i=0
  path="${1%/}"
  # Walk up from the first path, collecting every ancestor that hosts a
  # CMakeLists.txt, ordered from deepest to shallowest.
  while [ "$path" != "." ]; do
    if [[ -d $path && -f "$path/CMakeLists.txt" ]]; then
      possible_prefix[$i]="$path"
    fi
    i=$(($i + 1))
    path=$(dirname "$path")
  done
  lcp="."
  # The first (deepest) candidate that prefixes all arguments wins.
  for prefix in "${possible_prefix[@]}"; do
    # Quote "$@": unquoted $@ re-splits paths containing IFS characters.
    for path in "$@"; do
      # Quote "$prefix" in the pattern position so glob metacharacters in a
      # directory name are matched literally, not expanded.
      if [ "${path#"$prefix"}" = "${path}" ]; then
        continue 2
      fi
    done
    lcp="$prefix"
    break
  done
  echo "$lcp"
}

# Resolve the PR's base branch via the GitHub API; CIRCLE_PULL_REQUEST is a
# web URL, so rewrite /pull/ to the API's /pulls/ form.
base=$( \
  wget -q -O - "https://api.github.com/repos/$(echo ${CIRCLE_PULL_REQUEST:19} | sed "s/\/pull\//\/pulls\//")" \
  | sed -n -e "s/^.*IntelligentSoftwareSystems://p" \
  | sed -n -e "s/\".*$//p" \
)

# Intentionally unquoted: each changed file becomes a separate argument.
longest_common_prefix $(git -c core.quotepath=false diff --name-only $base $CIRCLE_SHA1)


================================================
FILE: .clang-format
================================================
---
Language: Cpp
BasedOnStyle: LLVM
AccessModifierOffset: -2
AlignAfterOpenBracket: true
AlignConsecutiveAssignments: true
AlignEscapedNewlinesLeft: false
AlignOperands: true
AlignTrailingComments: true
AllowAllParametersOfDeclarationOnNextLine: true
AllowShortBlocksOnASingleLine: false
AllowShortCaseLabelsOnASingleLine: false
AllowShortFunctionsOnASingleLine: All
AllowShortIfStatementsOnASingleLine: false
AllowShortLoopsOnASingleLine: false
AlwaysBreakAfterDefinitionReturnType: None
AlwaysBreakBeforeMultilineStrings: false
AlwaysBreakTemplateDeclarations: true
BinPackArguments: true
BinPackParameters: true
BreakBeforeBinaryOperators: None
BreakBeforeBraces: Attach
BreakBeforeTernaryOperators: true
BreakConstructorInitializersBeforeComma: false
ColumnLimit: 80
CommentPragmas: '^ IWYU pragma:'
ConstructorInitializerAllOnOneLineOrOnePerLine: false
ConstructorInitializerIndentWidth: 4
ContinuationIndentWidth: 4
Cpp11BracedListStyle: true
DeriveLineEnding: false
DerivePointerAlignment: false
DisableFormat: false
ExperimentalAutoDetectBinPacking: false
ForEachMacros: [ foreach, Q_FOREACH, BOOST_FOREACH ]
IndentCaseLabels: false
IndentWidth: 2
IndentWrappedFunctionNames: false
KeepEmptyLinesAtTheStartOfBlocks: true
MacroBlockBegin: ''
MacroBlockEnd: ''
MaxEmptyLinesToKeep: 1
NamespaceIndentation: None
ObjCBlockIndentWidth: 2
ObjCSpaceAfterProperty: false
ObjCSpaceBeforeProtocolList: true
PenaltyBreakBeforeFirstCallParameter: 19 PenaltyBreakComment: 300 PenaltyBreakFirstLessLess: 120 PenaltyBreakString: 1000 PenaltyExcessCharacter: 1000000 PenaltyReturnTypeOnItsOwnLine: 60 PointerAlignment: Left SortIncludes: false SpaceAfterCStyleCast: false SpaceBeforeAssignmentOperators: true SpaceBeforeParens: ControlStatements SpaceInEmptyParentheses: false SpacesBeforeTrailingComments: 1 SpacesInAngles: false SpacesInContainerLiterals: true SpacesInCStyleCastParentheses: false SpacesInParentheses: false SpacesInSquareBrackets: false Standard: Cpp11 TabWidth: 4 UseTab: Never ... ================================================ FILE: .clang-tidy ================================================ --- # Enable most checks then disable (-) problematics ones: # # Some checks are good in principle but cannot be applied automatically either # because they require taste or the autofix can generate wrong code: # # - cppcoreguidelines-pro-type-member-init: wrong code sometimes # - google-explicit-constructor: libllvm has implicit conversions # - modernize-use-no-discard # - modernize-use-transparent-functors # - modernize-use-using: autofix doesn't handle dependent type templates # - readability-static-accessed-through-instance: wrong code sometimes # # No consensus: # # - modernize-use-trailing-return-type: also huge code churn # - readability-convert-member-functions-to-static # - readability-implicit-bool-conversion Checks: | abseil-*, boost-*, bugprone-*, clang-analyzer-*, clang-diagnostic-*, cppcoreguidelines-*, -cppcoreguidelines-pro-type-member-init, google-*, -google-explicit-constructor, modernize-*, -modernize-use-nodiscard, -modernize-use-trailing-return-type, -modernize-use-transparent-functors, -modernize-use-using, mpi-*, openmp-*, performance-*, readability-*, -readability-convert-member-functions-to-static, -readability-static-accessed-through-instance, -readability-implicit-bool-conversion, WarningsAsErrors: '' HeaderFilterRegex: '' AnalyzeTemporaryDtors: 
false FormatStyle: file CheckOptions: - key: cert-dcl16-c.NewSuffixes value: 'L;LL;LU;LLU' - key: cert-oop54-cpp.WarnOnlyIfThisHasSuspiciousField value: '0' - key: cppcoreguidelines-explicit-virtual-functions.IgnoreDestructors value: '1' - key: cppcoreguidelines-non-private-member-variables-in-classes.IgnoreClassesWithAllMemberVariablesBeingPublic value: '1' - key: google-readability-braces-around-statements.ShortStatementLines value: '1' - key: google-readability-function-size.StatementThreshold value: '800' - key: google-readability-namespace-comments.ShortNamespaceLines value: '10' - key: google-readability-namespace-comments.SpacesBeforeComments value: '2' - key: modernize-loop-convert.MaxCopySize value: '16' - key: modernize-loop-convert.MinConfidence value: reasonable - key: modernize-loop-convert.NamingStyle value: CamelCase - key: modernize-pass-by-value.IncludeStyle value: llvm - key: modernize-replace-auto-ptr.IncludeStyle value: llvm - key: modernize-use-nullptr.NullMacros value: 'NULL' ... ================================================ FILE: .git-blame-ignore-revs ================================================ # Bulk-change revisions to ignore in git blame # # Requires git v2.23 # # To use: # # git blame --ignore-revs-file .git-blame-ignore-revs # # or more permanently: # # git config blame.ignoreRevsFile .git-blame-ignore-revs # Run clang-format. 02ecf4f4ea6ed8618a3826f98c3ea192ee38ca2d # Re-run clang-format. 47ddbe14de2e61b87749cd20bd368f07ef3c322f # Reorganize the lonestar directories. 6ade1c5ac3cf0c261aff7bee863e46b2c124d174 # Run clang-format. 
517fca343c75f842096b661e3ff883bb93f5c09e # Another round of clang-format 2264b05ece3f9ec2b9bf397594cc14ef99f498de # Fix endlines for barneshut app 558ccb83ab2e388c1202396f42d0881912e6393d ================================================ FILE: .gitignore ================================================ # no editor files *~ *.backup /.dir-locals.el *.orig *.patch /.project .settings .*.swo *.swp .vscode # no tool generated files .clang-complete .clangd compile_commands.json cscope.out /GPATH /GRTAGS /GTAGS .tags* tags .ycm_extra_conf.py # no build files /build* # no python build artifacts *.pyc /python/galois.egg-info /python/galois/*.so /_skbuild ================================================ FILE: .gitmodules ================================================ [submodule "moderngpu"] path = external/moderngpu url = https://github.com/moderngpu/moderngpu.git [submodule "cub"] path = external/cub url = https://github.com/NVlabs/cub.git [submodule "docs"] path = docs url = https://github.com/IntelligentSoftwareSystems/Galois-docs.git ================================================ FILE: .travis.yml ================================================ dist: bionic language: c++ git: submodules: true matrix: include: - os: osx osx_image: xcode11.3 before_script: - export CC=clang - export CXX=clang++ - brew install openmpi llvm fmt - mkdir build - export PATH=$PATH:/usr/local/opt/llvm/bin - cmake -S . 
-B build -DCMAKE_BUILD_TYPE=Release -DGALOIS_ENABLE_DIST=ON || exit 1 - env: - GCC_VER=7 addons: apt: sources: - sourceline: 'ppa:ubuntu-toolchain-r/test' - sourceline: 'deb https://apt.kitware.com/ubuntu/ bionic main' key_url: 'https://apt.kitware.com/keys/kitware-archive-latest.asc' - sourceline: 'ppa:mhier/libboost-latest' packages: - gcc-7 - g++-7 - cmake - libboost1.70-dev - libeigen3-dev - openmpi-bin - libopenmpi-dev - llvm-7-dev - libz-dev - libfmt-dev - env: - GCC_VER=8 addons: apt: sources: - sourceline: 'ppa:ubuntu-toolchain-r/test' - sourceline: 'deb https://apt.kitware.com/ubuntu/ bionic main' key_url: 'https://apt.kitware.com/keys/kitware-archive-latest.asc' - sourceline: 'ppa:mhier/libboost-latest' packages: - gcc-8 - g++-8 - cmake - libboost1.70-dev - libeigen3-dev - openmpi-bin - libopenmpi-dev - llvm-7-dev - libz-dev - libfmt-dev - env: - GCC_VER=9 addons: apt: sources: - sourceline: 'ppa:ubuntu-toolchain-r/test' - sourceline: 'deb https://apt.kitware.com/ubuntu/ bionic main' key_url: 'https://apt.kitware.com/keys/kitware-archive-latest.asc' - sourceline: 'ppa:mhier/libboost-latest' packages: - gcc-9 - g++-9 - cmake - libboost1.70-dev - libeigen3-dev - openmpi-bin - libopenmpi-dev - llvm-7-dev - libz-dev - libfmt-dev - env: - GCC_VER=10 addons: apt: sources: - sourceline: 'ppa:ubuntu-toolchain-r/test' - sourceline: 'deb https://apt.kitware.com/ubuntu/ bionic main' key_url: 'https://apt.kitware.com/keys/kitware-archive-latest.asc' - sourceline: 'ppa:mhier/libboost-latest' packages: - gcc-10 - g++-10 - cmake - libboost1.70-dev - libeigen3-dev - openmpi-bin - libopenmpi-dev - llvm-7-dev - libz-dev - libfmt-dev - env: - GCC_VER=10 - BUILD_TYPE=Debug addons: apt: sources: - sourceline: 'ppa:ubuntu-toolchain-r/test' - sourceline: 'deb https://apt.kitware.com/ubuntu/ bionic main' key_url: 'https://apt.kitware.com/keys/kitware-archive-latest.asc' - sourceline: 'ppa:mhier/libboost-latest' packages: - gcc-10 - g++-10 - cmake - libboost1.70-dev - 
libeigen3-dev - openmpi-bin - libopenmpi-dev - llvm-7-dev - libz-dev - libfmt-dev - env: - CLANG_VER=7 addons: apt: sources: - sourceline: 'ppa:ubuntu-toolchain-r/test' - sourceline: 'deb http://apt.llvm.org/bionic/ llvm-toolchain-bionic-7 main' key_url: 'https://apt.llvm.org/llvm-snapshot.gpg.key' - sourceline: 'deb https://apt.kitware.com/ubuntu/ bionic main' key_url: 'https://apt.kitware.com/keys/kitware-archive-latest.asc' - sourceline: 'ppa:mhier/libboost-latest' packages: - clang-7 - cmake - libboost1.70-dev - libeigen3-dev - openmpi-bin - libopenmpi-dev - llvm-7-dev - libz-dev - libfmt-dev - env: - CLANG_VER=8 addons: apt: sources: - sourceline: 'ppa:ubuntu-toolchain-r/test' - sourceline: 'deb http://apt.llvm.org/bionic/ llvm-toolchain-bionic-8 main' key_url: 'https://apt.llvm.org/llvm-snapshot.gpg.key' - sourceline: 'deb https://apt.kitware.com/ubuntu/ bionic main' key_url: 'https://apt.kitware.com/keys/kitware-archive-latest.asc' - sourceline: 'ppa:mhier/libboost-latest' packages: - clang-8 - cmake - libboost1.70-dev - libeigen3-dev - openmpi-bin - libopenmpi-dev - llvm-7-dev - libz-dev - libfmt-dev - env: - CLANG_VER=9 addons: apt: sources: - sourceline: 'ppa:ubuntu-toolchain-r/test' - sourceline: 'deb http://apt.llvm.org/bionic/ llvm-toolchain-bionic-9 main' key_url: 'https://apt.llvm.org/llvm-snapshot.gpg.key' - sourceline: 'deb https://apt.kitware.com/ubuntu/ bionic main' key_url: 'https://apt.kitware.com/keys/kitware-archive-latest.asc' - sourceline: 'ppa:mhier/libboost-latest' packages: - clang-9 - cmake - libboost1.70-dev - libeigen3-dev - openmpi-bin - libopenmpi-dev - llvm-7-dev - libz-dev - libfmt-dev - env: - CLANG_VER=10 addons: apt: sources: - sourceline: 'ppa:ubuntu-toolchain-r/test' - sourceline: 'deb http://apt.llvm.org/bionic/ llvm-toolchain-bionic-10 main' key_url: 'https://apt.llvm.org/llvm-snapshot.gpg.key' - sourceline: 'deb https://apt.kitware.com/ubuntu/ bionic main' key_url: 'https://apt.kitware.com/keys/kitware-archive-latest.asc' 
- sourceline: 'ppa:mhier/libboost-latest' packages: - clang-10 - cmake - libboost1.70-dev - libeigen3-dev - openmpi-bin - libopenmpi-dev - llvm-7-dev - libz-dev - libfmt-dev - env: - CLANG_VER=10 - BUILD_TYPE=Debug addons: apt: sources: - sourceline: 'ppa:ubuntu-toolchain-r/test' - sourceline: 'deb http://apt.llvm.org/bionic/ llvm-toolchain-bionic-10 main' key_url: 'https://apt.llvm.org/llvm-snapshot.gpg.key' - sourceline: 'deb https://apt.kitware.com/ubuntu/ bionic main' key_url: 'https://apt.kitware.com/keys/kitware-archive-latest.asc' - sourceline: 'ppa:mhier/libboost-latest' packages: - clang-10 - cmake - libboost1.70-dev - libeigen3-dev - openmpi-bin - libopenmpi-dev - llvm-7-dev - libz-dev - libfmt-dev before_script: # Depending on whether GCC_VER or CLANG_VER is set and nonempty, # set CC and CXX accordingly. - | if [ -n "$GCC_VER" ]; then export CC="gcc-$GCC_VER" export CXX="g++-$GCC_VER" fi - | if [ -n "$CLANG_VER" ]; then export CC="clang-$CLANG_VER" export CXX="clang++-$CLANG_VER" fi - | # Check if BUILD_TYPE is set at all, not just whether it is empty or unset. # See https://stackoverflow.com/a/13864829/1935144. if [ -z ${BUILD_TYPE+x} ]; then export BUILD_TYPE=Release fi - mkdir build # Use apt-installed llvm-7-dev rather than travis-provided one which is # picked up through the local clang-7 install in /usr/local/clang-7. - export CMAKE_PREFIX_PATH=/usr/lib/llvm-7 # Use apt-installed cmake rather than travis-provided one # (/usr/local/cmake-3.12.4/bin/cmake). - /usr/bin/cmake -S . 
-B build -DCMAKE_PREFIX_PATH=$CMAKE_PREFIX_PATH -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DGALOIS_ENABLE_DIST=ON || exit 1 script: - make -C build input - cmake --build build --parallel 2 || exit 1 - (cd build && ctest --output-on-failure --parallel 2 --label-regex quick) || exit 1 notifications: email: false ================================================ FILE: CMakeLists.txt ================================================ cmake_minimum_required(VERSION 3.13) project(Galois) list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/Modules") include(GNUInstallDirs) file(STRINGS config/version.txt GALOIS_VERSION) string(REGEX REPLACE "[ \t\n]" "" GALOIS_VERSION ${GALOIS_VERSION}) string(REGEX REPLACE "([0-9]+)\\.([0-9]+)\\.([0-9]+)" "\\1" GALOIS_VERSION_MAJOR ${GALOIS_VERSION}) string(REGEX REPLACE "([0-9]+)\\.([0-9]+)\\.([0-9]+)" "\\2" GALOIS_VERSION_MINOR ${GALOIS_VERSION}) string(REGEX REPLACE "([0-9]+)\\.([0-9]+)\\.([0-9]+)" "\\3" GALOIS_VERSION_PATCH ${GALOIS_VERSION}) set(GALOIS_COPYRIGHT_YEAR "2018") # Also in COPYRIGHT if(NOT CMAKE_BUILD_TYPE) message(STATUS "No build type selected, default to Release") # cmake default flags with relwithdebinfo is -O2 -g # cmake default flags with release is -O3 -DNDEBUG set(CMAKE_BUILD_TYPE "Release") endif() ###### Options (alternatively pass as options to cmake -DName=Value) ###### ###### Distributed-heterogeneous features ###### set(GALOIS_ENABLE_DIST OFF CACHE BOOL "Enable distributed features") set(GALOIS_CUDA_CAPABILITY "" CACHE STRING "Semi-colon list of CUDA compute capability version numbers to enable GPU features") # e.g., "3.7;6.1" set(GALOIS_COMM_STATS OFF CACHE BOOL "Report more detailed statistics of communication") ###### General features ###### set(GALOIS_ENABLE_PAPI OFF CACHE BOOL "Use PAPI counters for profiling") set(GALOIS_ENABLE_VTUNE OFF CACHE BOOL "Use VTune for profiling") set(GALOIS_STRICT_CONFIG OFF CACHE BOOL "Instead of falling back gracefully, fail") set(GALOIS_GRAPH_LOCATION "" CACHE PATH 
"Location of inputs for tests if downloaded/stored separately.") set(CXX_CLANG_TIDY "" CACHE STRING "Semi-colon list specifying clang-tidy command and arguments") set(CMAKE_CXX_COMPILER_LAUNCHER "" CACHE STRING "Semi-colon list specifying command to wrap compiler invocations (e.g., ccache)") set(USE_ARCH native CACHE STRING "Optimize for a specific processor architecture ('none' to disable)") set(GALOIS_USE_SANITIZER "" CACHE STRING "Semi-colon list of sanitizers to use (Memory, MemoryWithOrigins, Address, Undefined, Thread)") # This option is automatically handled by CMake. # It makes add_library build a shared lib unless STATIC is explicitly specified. # Putting this here is mostly just a placeholder so people know it's an option. # Currently this is really only intended to change anything for the libgalois_shmem target. set(BUILD_SHARED_LIBS OFF CACHE BOOL "Build shared libraries") set(BUILD_DOCS "" CACHE STRING "Build documentation with make doc. Supported values: , external, internal. 
external docs hide '*-draft*' and '*-internal* documentation pages and directories when building documentation") ###### Developer features ###### set(GALOIS_PER_ROUND_STATS OFF CACHE BOOL "Report statistics of each round of execution") set(GALOIS_NUM_TEST_GPUS "0" CACHE STRING "Number of test GPUs to use (on a single machine) for running the tests.") set(GALOIS_USE_LCI OFF CACHE BOOL "Use LCI network runtime instead of MPI") set(GALOIS_USE_BARE_MPI OFF CACHE BOOL "Use MPI directly (no dedicated network-runtime thread)") set(GALOIS_NUM_TEST_THREADS "" CACHE STRING "Maximum number of threads to use when running tests (default: number of physical cores)") if(NOT GALOIS_NUM_TEST_THREADS) cmake_host_system_information(RESULT GALOIS_NUM_TEST_THREADS QUERY NUMBER_OF_PHYSICAL_CORES) endif() if(GALOIS_NUM_TEST_THREADS LESS_EQUAL 0) set(GALOIS_NUM_TEST_THREADS 1) endif() ###### Configure (users don't need to go beyond here) ###### include(CTest) ###### Configure compiler ###### # generate compile_commands.json set(CMAKE_EXPORT_COMPILE_COMMANDS ON) set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CXX_EXTENSIONS OFF) #...without compiler extensions like gnu++11 set(CMAKE_POSITION_INDEPENDENT_CODE ON) # Always include debug info add_compile_options("$<$:-g>") # GCC if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 7) message(FATAL_ERROR "gcc must be version 7 or higher. Found ${CMAKE_CXX_COMPILER_VERSION}.") endif() add_compile_options("$<$:-Wall;-Wextra>") if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 11) add_compile_options("$<$:-Werror>") endif() endif() if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang") if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 7) message(FATAL_ERROR "clang must be version 7 or higher. 
Found ${CMAKE_CXX_COMPILER_VERSION}.") endif() add_compile_options("$<$:-Wall;-Wextra>") if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 11) add_compile_options("$<$:-Werror>") endif() endif() if(CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang") add_compile_options("$<$:-Wall;-Wextra>") if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 12) add_compile_options("$<$:-Werror>") endif() endif() if(CMAKE_CXX_COMPILER_ID STREQUAL "Intel") if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 19.0.1) message(FATAL_ERROR "icpc must be 19.0.1 or higher. Found ${CMAKE_CXX_COMPILER_VERSION}.") endif() # Avoid warnings when using noinline for methods defined inside class defintion. add_compile_options("$<$:-wd2196>") endif() # Enable architecture-specific optimizations include(CheckArchFlags) if(ARCH_FLAGS_FOUND) add_compile_options("$<$:${ARCH_CXX_FLAGS}>") add_compile_options("$<$:${ARCH_C_FLAGS}>") add_link_options(${ARCH_LINK_FLAGS}) endif() if(CXX_CLANG_TIDY) set(CMAKE_CXX_CLANG_TIDY ${CXX_CLANG_TIDY} "-header-filter=.*${PROJECT_SOURCE_DIR}.*") # Ignore warning flags intended for the CXX program. This only works because # the two compilers we care about, clang and gcc, both understand # -Wno-unknown-warning-option. 
add_compile_options("$<$:-Wno-unknown-warning-option>") endif() ###### Configure features ###### if(GALOIS_ENABLE_VTUNE) set(VTune_ROOT /opt/intel/vtune_amplifier) find_package(VTune REQUIRED) include_directories(${VTune_INCLUDE_DIRS}) add_definitions(-DGALOIS_ENABLE_VTUNE) endif() if(GALOIS_ENABLE_PAPI) find_package(PAPI REQUIRED) include_directories(${PAPI_INCLUDE_DIRS}) add_definitions(-DGALOIS_ENABLE_PAPI) endif() find_package(Threads REQUIRED) include(CheckMmap) include(CheckHugePages) if(NOT HAVE_HUGEPAGES AND GALOIS_STRICT_CONFIG) message(FATAL_ERROR "Need huge pages") endif() find_package(Boost 1.58.0 REQUIRED COMPONENTS serialization iostreams) find_package(LLVM REQUIRED CONFIG) if("${LLVM_PACKAGE_VERSION}" VERSION_LESS "7") message(FATAL_ERROR "LLVM 7 or greater is required.") endif() if(NOT DEFINED LLVM_ENABLE_RTTI) message(FATAL_ERROR "Could not determine if LLVM has RTTI enabled.") endif() if(NOT ${LLVM_ENABLE_RTTI}) message(FATAL_ERROR "Galois requires a build of LLVM that includes RTTI. 
Most package managers do this already, but if you built LLVM from source you need to configure it with `-DLLVM_ENABLE_RTTI=ON`") endif() target_include_directories(LLVMSupport INTERFACE ${LLVM_INCLUDE_DIRS}) include(HandleSanitizer) include(CheckEndian) ###### Test Inputs ###### if(GALOIS_GRAPH_LOCATION) set(BASEINPUT "${GALOIS_GRAPH_LOCATION}") set(BASEOUTPUT "${GALOIS_GRAPH_LOCATION}") message(STATUS "Using graph input and output location ${GALOIS_GRAPH_LOCATION}") elseif(EXISTS /net/ohm/export/iss) set(BASEINPUT /net/ohm/export/iss/inputs) MESSAGE(STATUS "Using graph input location /net/ohm/export/iss/inputs") set(BASEOUTPUT /net/ohm/export/iss/dist-outputs) MESSAGE(STATUS "Using graph output location /net/ohm/export/iss/dist-outputs") else() set(BASEINPUT "${PROJECT_BINARY_DIR}/inputs") set(BASEOUTPUT "${PROJECT_BINARY_DIR}/inputs") message(STATUS "Use 'make input' to download inputs and outputs in the build directory") endif() ###### Source finding ###### add_custom_target(lib) add_custom_target(apps) # Core libraries (lib) add_subdirectory(libsupport) add_subdirectory(libgalois) add_subdirectory(libpygalois) if (GALOIS_ENABLE_DIST) find_package(MPI REQUIRED) add_subdirectory(libdist) add_subdirectory(libcusp) add_subdirectory(libgluon) endif() string(COMPARE NOTEQUAL "${GALOIS_CUDA_CAPABILITY}" "" GALOIS_ENABLE_GPU) if (GALOIS_ENABLE_GPU) enable_language(CUDA) foreach(GENCODE ${GALOIS_CUDA_CAPABILITY}) string(REPLACE "." 
"" GENCODE ${GENCODE}) add_compile_options("$<$:-gencode=arch=compute_${GENCODE},code=sm_${GENCODE}>") endforeach() # This is necessary to allow building for CUDA 11.x (where CUB is bundled) and earlier versions (where CUB is not included) add_definitions(-DTHRUST_IGNORE_CUB_VERSION_CHECK) add_subdirectory(libgpu) endif() add_subdirectory(libpangolin) # Applications (apps) add_subdirectory(lonestar) add_subdirectory(scripts) add_subdirectory(inputs) add_subdirectory(tools) if(USE_EXP) add_subdirectory(lonestar/experimental) endif(USE_EXP) ###### Documentation ###### if(BUILD_DOCS) set(GALOIS_ROOT ${CMAKE_CURRENT_SOURCE_DIR}) add_subdirectory(docs) endif() ###### Installation ###### include(CMakePackageConfigHelpers) write_basic_package_version_file( ${CMAKE_CURRENT_BINARY_DIR}/GaloisConfigVersion.cmake VERSION ${GALOIS_VERSION} COMPATIBILITY SameMajorVersion ) configure_package_config_file( cmake/GaloisConfig.cmake.in ${CMAKE_CURRENT_BINARY_DIR}/GaloisConfig.cmake INSTALL_DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/Galois" PATH_VARS CMAKE_INSTALL_INCLUDEDIR CMAKE_INSTALL_LIBDIR CMAKE_INSTALL_BINDIR ) install( FILES "${CMAKE_CURRENT_BINARY_DIR}/GaloisConfigVersion.cmake" "${CMAKE_CURRENT_BINARY_DIR}/GaloisConfig.cmake" DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/Galois" COMPONENT dev ) install( EXPORT GaloisTargets NAMESPACE Galois:: DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/Galois" COMPONENT dev ) ###### Distribution ###### set(CPACK_GENERATOR "TGZ") set(CPACK_ARCHIVE_COMPONENT_INSTALL ON) set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/COPYRIGHT") set(CPACK_RESOURCE_FILE_README "${CMAKE_CURRENT_SOURCE_DIR}/README.md") set(CPACK_PACKAGE_VERSION_MAJOR ${GALOIS_VERSION_MAJOR}) set(CPACK_PACKAGE_VERSION_MINOR ${GALOIS_VERSION_MINOR}) set(CPACK_PACKAGE_VERSION_PATCH ${GALOIS_VERSION_PATCH}) include(CPack) ================================================ FILE: COPYRIGHT ================================================ Galois, a framework to exploit amorphous 
data-parallelism in irregular programs. Copyright (C) 2018, The University of Texas at Austin. All rights reserved. UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances shall University be liable for incidental, special, indirect, direct or consequential damages or loss of profits, interruption of business, or related expenses which may arise from use of Software or Documentation, including but not limited to those resulting from defects in Software and/or Documentation, or loss or inaccuracy of data of any kind. This software is released under the terms of the 3-Clause BSD License (a copy is located in LICENSE.txt at the top-level directory). ================================================ FILE: LICENSE.txt ================================================ The 3-Clause BSD License Copyright 2018 The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ================================================ FILE: README.md ================================================ Overview ======== [![CircleCI](https://circleci.com/gh/IntelligentSoftwareSystems/Galois.svg?style=svg)](https://circleci.com/gh/IntelligentSoftwareSystems/Galois) [![Build Status](https://travis-ci.org/IntelligentSoftwareSystems/Galois.svg?branch=master)](https://travis-ci.org/IntelligentSoftwareSystems/Galois) Galois is a C++ library designed to ease parallel programming, especially for applications with irregular parallelism (e.g., irregular amount of work in parallel sections, irregular memory accesses and branching patterns). It implements an implicitly parallel programming model, where the programmer replaces serial loop constructs (e.g. for and while) and serial data structures in their algorithms with parallel loop constructs and concurrent data structures provided by Galois to express their algorithms. Galois is designed so that the programmer does not have to deal with low-level parallel programming constructs such as threads, locks, barriers, condition variables, etc. 
Highlights include: - Parallel *for_each* loop that handles dependencies between iterations, as well as dynamic work creation, and a *do_all* loop for simple parallelism. Both provide load balancing and excellent scalability on multi-socket systems - A concurrent graph library designed for graph analytics algorithms as well as other domains such as irregular meshes. - Scalable concurrent containers such as bag, vector, list, etc. Galois is released under the BSD-3-Clause license. Building Galois =============== You can checkout the latest release by typing (in a terminal): ```Shell git clone -b release-5.0 https://github.com/IntelligentSoftwareSystems/Galois ``` The master branch will be regularly updated, so you may try out the latest development code as well by checking out master branch: ```Shell git clone https://github.com/IntelligentSoftwareSystems/Galois ``` Dependencies ------------ Galois builds, runs, and has been tested on GNU/Linux. Even though Galois may build on systems similar to Linux, we have not tested correctness or performance, so please beware. At the minimum, Galois depends on the following software: - A modern C++ compiler compliant with the C++-17 standard (gcc >= 7, Intel >= 19.0.1, clang >= 7.0) - CMake (>= 3.13) - Boost library (>= 1.58.0, we recommend building/installing the full library) - libllvm (>= 7.0 with RTTI support) - libfmt (>= 4.0) Here are the dependencies for the optional features: - Linux HUGE_PAGES support (please see [www.kernel.org/doc/Documentation/vm/hugetlbpage.txt](https://www.kernel.org/doc/Documentation/vm/hugetlbpage.txt)). Performance will most likely degrade without HUGE_PAGES enabled. Galois uses 2MB huge page size and relies on the kernel configuration to set aside a large amount of 2MB pages. 
For example, our performance testing machine (4x14 cores, 192GB RAM) is configured to support up to 65536 2MB pages: ```Shell cat /proc/meminfo | fgrep Huge AnonHugePages: 104448 kB HugePages_Total: 65536 HugePages_Free: 65536 HugePages_Rsvd: 0 HugePages_Surp: 0 Hugepagesize: 2048 kB ``` - libnuma support. Performance may degrade without it. Please install libnuma-dev on Debian like systems, and numactl-dev on Red Hat like systems. - Doxygen (>= 1.8.5) for compiling documentation as webpages or latex files - PAPI (>= 5.2.0.0 ) for profiling sections of code - Vtune (>= 2017 ) for profiling sections of code - MPICH2 (>= 3.2) if you are interested in building and running distributed system applications in Galois - CUDA (>= 8.0 and < 11.0) if you want to build GPU or distributed heterogeneous applications. Note that versions >= 11.0 use an incompatible CUB module and will fail to execute. - Eigen (3.3.1 works for us) for some matrix-completion app variants Compiling and Testing Galois ---------------------------- We use CMake to streamline building, testing and installing Galois. In the following, we will highlight some common commands. Let's assume that `SRC_DIR` is the directory where the source code for Galois resides, and you wish to build Galois in some `BUILD_DIR`. Run the following commands to set up a build directory: ```Shell SRC_DIR=`pwd` # Or top-level Galois source dir BUILD_DIR= mkdir -p $BUILD_DIR cmake -S $SRC_DIR -B $BUILD_DIR -DCMAKE_BUILD_TYPE=Release ``` You can also set up a `Debug` build by running the following instead of the last command above: ```Shell cmake -S $SRC_DIR -B $BUILD_DIR -DCMAKE_BUILD_TYPE=Debug ``` Galois applications are in `lonestar` directory. 
In order to build a particular application: ```Shell make -C $BUILD_DIR/lonestar/ -j # or alternatively make -C $BUILD_DIR -j # or cmake --build $BUILD_DIR --parallel ``` You can also build everything by running `make -j` in the top-level of build directory, but that may take a lot of time. Setting the `BUILD_SHARED_LIBS` to `ON` when calling CMake will make the core runtime library be built as a shared object instead of a static library. The tests for the core runtime will be built by default when you run `make` with no target specified. They can be also built explicitly with: ```Shell make -C $BUILD_DIR/test ``` We provide a few sample inputs that can be downloaded by running: ```Shell make -C $BUILD_DIR input ``` `make input` will download a tarball of inputs and extract it to `$BUILD_DIR/inputs/small_inputs` directory. The tarball is downloaded to `$BUILD_DIR/inputs` Most of the Galois apps have corresponding tests. These tests depend on downloading the reference inputs and building the corresponding apps and test binaries. Once the reference inputs have been downloaded and everything has been built, the tests for the core library and all the apps can be run by running: ```Shell make test # or alternatively ctest ``` in the build directory. Running Galois Applications =========================== Graph Format ------------ Many Galois/Lonestar applications work with graphs. We store graphs in a binary format called *galois graph file* (`.gr` file extension). Other formats such as edge-list or Matrix-Market can be converted to `.gr` format with `graph-convert` tool provided in galois. You can build graph-convert as follows: ```Shell cd $BUILD_DIR make graph-convert ./tools/graph-convert/graph-convert --help ``` Other applications, such as Delaunay Mesh Refinement may read special file formats or some may even generate random inputs on the fly. Running ------- All Lonestar applications take a `-t` command-line option to specify the number of threads to use. 
All applications run a basic sanity check (often insufficient for correctness) on the program output, which can be turned off with the `-noverify` option. You can specify `-help` command-line option to print all available options. Upon successful completion, each application will produce some stats regarding running time of various sections, parallel loop iterations and memory usage, etc. These stats are in CSV format and can be redirected to a file using `-statFile` option. Please refer to the manual for details on stats. Running LonestarGPU applications -------------------------- Please refer to `lonestar/analytics/gpu/README.md` and `lonestar/scientific/gpu/README.md` for more details on compiling and running LonestarGPU applications. Running Distributed Galois -------------------------- Please refer to `lonestar/analytics/distributed/README.md` for more details on running distributed benchmarks. Documentation ============= Galois documentation is produced using doxygen, included in this repository, which includes a tutorial, a user's manual and API documentation for the Galois library. Users can build doxygen documentation in the build directory using: ```Shell cd $BUILD_DIR make doc your-fav-browser html/index.html & ``` See online documentation at: [http://iss.ices.utexas.edu/?p=projects/galois](http://iss.ices.utexas.edu/?p=projects/galois) Source-Tree Organization ======================== - `libgalois` contains the source code for the shared-memory Galois library, e.g., runtime, graphs, worklists, etc. - `lonestar` contains the Lonestar benchmark applications and tutorial examples for Galois - `libdist` contains the source code for the distributed-memory and heterogeneous Galois library - `lonestardist` contains the source code for the distributed-memory and heterogeneous benchmark applications. Please refer to `lonestardist/README.md` for instructions on building and running these apps. 
- `tools` contains various helper programs such as graph-converter to convert between graph file formats and graph-stats to print graph properties Using Galois as a library ========================= There are two common ways to use Galois as a library. One way is to copy this repository into your own CMake project, typically using a git submodule. Then you can put the following in your CMakeLists.txt: ```CMake add_subdirectory(galois EXCLUDE_FROM_ALL) add_executable(app ...) target_link_libraries(app Galois::shmem) ``` The other common method is to install Galois outside your project and import it as a package. If you want to install Galois, assuming that you wish to install it under `INSTALL_DIR`: ```Shell cmake -DCMAKE_INSTALL_PREFIX=$INSTALL_DIR $SRC_DIR make install ``` Then, you can put something like the following in CMakeLists.txt: ```CMake list(APPEND CMAKE_PREFIX_PATH ${INSTALL_DIR}) find_package(Galois REQUIRED) add_executable(app ...) target_link_libraries(app Galois::shmem) ``` If you are not using CMake, the corresponding basic commands (although the specific commands vary by system) are: ```Shell c++ -std=c++17 app.cpp -I$INSTALL_DIR/include -L$INSTALL_DIR/lib -lgalois_shmem ``` Third-Party Libraries and Licensing ==================== Galois includes some third party libraries that do not use the same license as Galois. This includes the bliss library (located in lonestar/include/Mining/bliss) and Modern GPU (located in libgpu/moderngpu). Please be aware of this when using Galois. Contact Us ========== For bugs, please raise an [issue](https://github.com/IntelligentSoftwareSystems/Galois/issues) on GitHub. Questions and comments are also welcome at the Galois users mailing list: [galois-users@utlists.utexas.edu](galois-users@utlists.utexas.edu). You may [subscribe here](https://utlists.utexas.edu/sympa/subscribe/galois-users). 
If you find a bug, it would help us if you sent (1) the command line and program inputs and outputs and (2) a core dump, preferably from an executable built with the debug build. You can enable core dumps by setting `ulimit -c unlimited` before running your program. The location where the core dumps will be stored can be determined with `cat /proc/sys/kernel/core_pattern`. To create a debug build, assuming you will build Galois in `BUILD_DIR` and the source is in `SRC_DIR`: ```Shell cmake -S $SRC_DIR -B $BUILD_DIR -DCMAKE_BUILD_TYPE=Debug make -C $BUILD_DIR ``` A simple way to capture relevant debugging details is to use the `script` command, which will record your terminal input and output. For example, ```Shell script debug-log.txt ulimit -c unlimited cat /proc/sys/kernel/core_pattern make -C $BUILD_DIR VERBOSE=1 my-app with-failing-input exit ``` This will generate a file `debug-log.txt`, which you can send to the mailing list:[galois-users@utlists.utexas.edu](galois-users@utlists.utexas.edu) for further debugging or supply when opening a GitHub issue. ================================================ FILE: cmake/GaloisConfig.cmake.in ================================================ # Config file for the Galois package # # It exports the following targets: # Galois::shmem # Galois::dist # ... 
# (see GaloisTargets.cmake for all of them) # # It defines the following variables for legacy importing: # Galois_INCLUDE_DIRS # Galois_LIBRARIES # Galois_LIBRARY_DIRS # Galois_BIN_DIRS include(CMakeFindDependencyMacro) @PACKAGE_INIT@ set_and_check(Galois_INCLUDE_DIRS "@PACKAGE_CMAKE_INSTALL_INCLUDEDIR@") set_and_check(Galois_LIBRARY_DIRS "@PACKAGE_CMAKE_INSTALL_LIBDIR@") set_and_check(Galois_BIN_DIRS "@PACKAGE_CMAKE_INSTALL_BINDIR@") set(Galois_LIBRARIES galois_shmem) find_dependency(Threads REQUIRED) find_dependency(Boost 1.58.0 REQUIRED COMPONENTS serialization iostreams) if (@GALOIS_ENABLE_DIST@) find_dependency(MPI REQUIRED) endif() get_filename_component(GALOIS_CMAKE_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH) if(NOT Galois::shmem) include("${GALOIS_CMAKE_DIR}/GaloisTargets.cmake") endif() ================================================ FILE: cmake/Modules/CheckArchFlags.cmake ================================================ # Find architecture-specific flags # # Once done this will define # ARCH_FLAGS_FOUND # ARCH_CXX_FLAGS - Compiler flags to enable architecture-specific optimizations # ARCH_C_FLAGS - Compiler flags to enable architecture-specific optimizations # ARCH_LINK_FLAGS - Compiler flags to enable architecture-specific optimizations include(CheckCXXCompilerFlag) if(NOT USE_ARCH OR USE_ARCH STREQUAL "none" OR ARCH_FLAGS_FOUND) set(ARCH_CXX_FLAGS_CANDIDATES) else() set(ARCH_CXX_FLAGS_CANDIDATES "-march=${USE_ARCH}") endif() if(USE_ARCH STREQUAL "mic") if(CMAKE_CXX_COMPILER_ID MATCHES "Intel") list(APPEND ARCH_CXX_FLAGS_CANDIDATES -mmic) endif() if(CMAKE_COMPILER_IS_GNUCC) list(APPEND ARCH_CXX_FLAGS_CANDIDATES -march=knc) endif() endif() foreach(FLAG ${ARCH_CXX_FLAGS_CANDIDATES}) message(STATUS "Try architecture flag = [${FLAG}]") unset(ARCH_CXX_FLAGS_DETECTED) check_cxx_compiler_flag("${FLAG}" ARCH_CXX_FLAGS_DETECTED) if(ARCH_CXX_FLAGS_DETECTED) set(ARCH_FLAGS_FOUND "YES") set(ARCH_CXX_FLAGS "${FLAG}") set(ARCH_C_FLAGS "${FLAG}") set(ARCH_LINK_FLAGS 
"${FLAG}") endif() endforeach() ================================================ FILE: cmake/Modules/CheckCilk.cmake ================================================ include(CheckCXXSourceCompiles) set(Cilk_CXX_TEST_SOURCE " #include <cilk/cilk.h> int main(){ cilk_for(int i=0;i<1; ++i); } ") CHECK_CXX_SOURCE_COMPILES("${Cilk_CXX_TEST_SOURCE}" HAVE_CILK) if(HAVE_CILK) message(STATUS "A compiler with CILK support found") endif() ================================================ FILE: cmake/Modules/CheckEndian.cmake ================================================ include(TestBigEndian) TEST_BIG_ENDIAN(HAVE_BIG_ENDIAN) include(CheckIncludeFiles) CHECK_INCLUDE_FILES(endian.h HAVE_ENDIAN_H) include(CheckSymbolExists) CHECK_SYMBOL_EXISTS(le64toh "endian.h" HAVE_LE64TOH) CHECK_SYMBOL_EXISTS(le32toh "endian.h" HAVE_LE32TOH) CHECK_SYMBOL_EXISTS(htobe64 "endian.h" HAVE_HTOBE64) CHECK_SYMBOL_EXISTS(htobe32 "endian.h" HAVE_HTOBE32) CHECK_SYMBOL_EXISTS(htole64 "endian.h" HAVE_HTOLE64) CHECK_SYMBOL_EXISTS(htole32 "endian.h" HAVE_HTOLE32) ================================================ FILE: cmake/Modules/CheckHugePages.cmake ================================================ include(CheckCSourceRuns) set(HugePages_C_TEST_SOURCE " #ifdef __linux__ #include <sys/mman.h> #endif #include <stdio.h> int main(int c, char** argv) { void *ptr = mmap(0, 2*1024*1024, PROT_READ|PROT_WRITE, MAP_HUGETLB, -1, 0); return ptr != MAP_FAILED; } ") if(HAVE_HUGEPAGES) else() CHECK_C_SOURCE_RUNS("${HugePages_C_TEST_SOURCE}" HAVE_HUGEPAGES_INTERNAL) if(HAVE_HUGEPAGES_INTERNAL) message(STATUS "Huge pages found") set(HAVE_HUGEPAGES "${HAVE_HUGEPAGES_INTERNAL}" CACHE BOOL "Have hugepages") endif() endif() ================================================ FILE: cmake/Modules/CheckMmap.cmake ================================================ include(CheckCSourceCompiles) set(Mmap64_C_TEST_SOURCE " #ifdef __linux__ #include <sys/mman.h> #endif #include <stdio.h> int main(int c, char** argv) { void *ptr = mmap64(0, 2*1024*1024, PROT_READ|PROT_WRITE, MAP_PRIVATE, -1, 0); 
return 0; } ") if(HAVE_MMAP64) else() CHECK_C_SOURCE_COMPILES("${Mmap64_C_TEST_SOURCE}" HAVE_MMAP64_INTERNAL) if(HAVE_MMAP64_INTERNAL) message(STATUS "mmap64 found") set(HAVE_MMAP64 "${HAVE_MMAP64_INTERNAL}" CACHE BOOL "Have mmap64") endif() endif() ================================================ FILE: cmake/Modules/CheckSchedSetAffinity.cmake ================================================ include(CheckSymbolExists) if(SCHED_SETAFFINITY_FOUND) else() set(CMAKE_REQUIRED_DEFINITIONS -D_GNU_SOURCE) CHECK_SYMBOL_EXISTS(sched_setaffinity sched.h HAVE_SCHED_SETAFFINITY_INTERNAL) if(HAVE_SCHED_SETAFFINITY_INTERNAL) message(STATUS "sched_setaffinity found") set(SCHED_SETAFFINITY_FOUND "${HAVE_SCHED_SETAFFINITY_INTERNAL}") set(SCHED_SETAFFINITY_LIBRARIES rt) endif() endif() ================================================ FILE: cmake/Modules/FindCBLAS.cmake ================================================ # Copyright 2009-2011 The VOTCA Development Team (http://www.votca.org) # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # #! \file #! \ingroup FindPackage #! \brief Find CBLAS #! #! Find the native CBLAS headers and libraries. #! #! - `CBLAS_LIBRARIES` - List of libraries when using cblas. #! - `CBLAS_INCLUDE_DIRS` - List of include directories #! - `CBLAS_FOUND` - True if cblas found. #! #! Cblas can be provided by libblas (Ubuntu), cblas or gslcblas, it will be searched for in #! this order. 
include(LibFindMacros) if (UNIX) find_package(PkgConfig QUIET) pkg_check_modules(CBLAS_PKGCONF QUIET cblas) endif() if (NOT CBLAS_FOUND) if(CBLAS_PKGCONF_FOUND) foreach(NEW_CBLAS_LIB ${CBLAS_PKGCONF_LIBRARIES}) find_library(LIB_${NEW_CBLAS_LIB} ${NEW_CBLAS_LIB} HINTS ${CBLAS_PKGCONF_LIBRARY_DIRS}) if(NOT LIB_${NEW_CBLAS_LIB}) message(FATAL_ERROR "Could not find ${NEW_CBLAS_LIB} where pkgconfig said it is: ${CBLAS_PKGCONF_LIBRARY_DIRS}") else(NOT LIB_${NEW_CBLAS_LIB}) message(STATUS "Found ${LIB_${NEW_CBLAS_LIB}}.") endif(NOT LIB_${NEW_CBLAS_LIB}) set(CBLAS_LIBRARY ${CBLAS_LIBRARY} ${LIB_${NEW_CBLAS_LIB}}) endforeach(NEW_CBLAS_LIB) else(CBLAS_PKGCONF_FOUND) set(CBLAS_HINT_PATH $ENV{CBLASDIR}/lib $ENV{CBLASDIR}/lib64 $ENV{UIBK_GSL_LIB}) # Check if libblas provides cblas (Ubuntu) find_library(BLAS_LIBRARY NAMES blas PATHS ${CBLAS_HINT_PATH}) if(BLAS_LIBRARY) include(CheckSymbolExists) set(CMAKE_REQUIRED_LIBRARIES ${BLAS_LIBRARY}) check_symbol_exists(cblas_scopy "cblas.h" BLAS_HAS_CBLAS) endif(BLAS_LIBRARY) set(CBLAS_CANDIDATES cblas gslcblas) if(BLAS_HAS_CBLAS) message(STATUS "libblas provides cblas.") set(CBLAS_CANDIDATES blas ${CBLAS_CANDIDATES}) endif(BLAS_HAS_CBLAS) find_library(CBLAS_LIBRARY NAMES ${CBLAS_CANDIDATES} PATHS ${CBLAS_HINT_PATH} ) endif(CBLAS_PKGCONF_FOUND) if("${CBLAS_LIBRARY}" MATCHES gslcblas) set(CBLAS_INCLUDE_CANDIDATE gsl/gsl_cblas.h) else("${CBLAS_LIBRARY}" MATCHES gslcblas) set(CBLAS_INCLUDE_CANDIDATE cblas.h) endif("${CBLAS_LIBRARY}" MATCHES gslcblas) find_path(CBLAS_INCLUDE_DIR ${CBLAS_INCLUDE_CANDIDATE} HINTS ${CBLAS_PKGCONF_INCLUDE_DIRS} $ENV{CBLASDIR}/include $ENV{UIBK_GSL_INC}) # Set the include dir variables and the libraries and let libfind_process do the rest. # NOTE: Singular variables for this library, plural for libraries this this lib depends on. 
set(CBLAS_PROCESS_INCLUDES CBLAS_INCLUDE_DIR) set(CBLAS_PROCESS_LIBS CBLAS_LIBRARY) libfind_process(CBLAS) message(STATUS "Using '${CBLAS_LIBRARIES}' for cblas.") endif(NOT CBLAS_FOUND) ================================================ FILE: cmake/Modules/FindFortran.cmake ================================================ # Check if Fortran is possibly around before using enable_lanauge because # enable_language(... OPTIONAL) does not fail gracefully if language is not # found: # http://public.kitware.com/Bug/view.php?id=9220 set(Fortran_EXECUTABLE) if(Fortran_EXECUTABLE) set(Fortran_FIND_QUIETLY TRUE) endif() find_program(Fortran_EXECUTABLE NAMES gfortran ifort g77 f77 g90 f90) include(FindPackageHandleStandardArgs) find_package_handle_standard_args(Fortran DEFAULT_MSG Fortran_EXECUTABLE) if(FORTRAN_FOUND) set(Fortran_FOUND TRUE) endif() ================================================ FILE: cmake/Modules/FindGASNET.cmake ================================================ # Find the GasNet librairy # GASNET_FOUND - system has GasNet lib # GASNET_INCLUDE_DIR - the GasNet include directory # GASNET_LIBRARIES - Libraries needed to use GasNet if(GASNET_INCLUDE_DIRS AND GASNET_LIBRARIES) set(GASNET_FIND_QUIETLY TRUE) endif() find_path(GASNET_INCLUDE_DIRS NAMES gasnet.h) find_library(GASNET_LIBRARY_1 NAMES gasnet amudp HINTS ${GASNET_INCLUDE_DIRS}/../lib ) find_library(GASNET_LIBRARY_2 NAMES gasnet gasnet-udp-par HINTS ${GASNET_INCLUDE_DIRS}/../lib ) set(GASNET_LIBRARIES ${GASNET_LIBRARY_2} ${GASNET_LIBRARY_1}) include(FindPackageHandleStandardArgs) find_package_handle_standard_args(GASNET DEFAULT_MSG GASNET_INCLUDE_DIRS GASNET_LIBRARIES) mark_as_advanced(GASNET_INCLUDE_DIRS GASNET_LIBRARIES) ================================================ FILE: cmake/Modules/FindGMP.cmake ================================================ # Find the GMP librairies # GMP_FOUND - system has GMP lib # GMP_INCLUDE_DIR - the GMP include directory # GMP_LIBRARIES - Libraries needed to use GMP # 
Copyright (c) 2006, Laurent Montel, # # Redistribution and use is allowed according to the terms of the BSD license. # For details see the accompanying COPYING-CMAKE-SCRIPTS file. if(GMP_INCLUDE_DIRS AND GMP_LIBRARIES AND GMPXX_LIBRARIES) set(GMP_FIND_QUIETLY TRUE) endif() find_path(GMP_INCLUDE_DIRS NAMES gmp.h) find_library(GMP_LIBRARIES NAMES gmp libgmp) find_library(GMPXX_LIBRARIES NAMES gmpxx libgmpxx) include(FindPackageHandleStandardArgs) find_package_handle_standard_args(GMP DEFAULT_MSG GMP_INCLUDE_DIRS GMP_LIBRARIES) mark_as_advanced(GMP_INCLUDE_DIRS GMP_LIBRARIES GMPXX_LIBRARIES) ================================================ FILE: cmake/Modules/FindGit.cmake ================================================ # The module defines the following variables: # GIT_EXECUTABLE - path to git command line client # GIT_FOUND - true if the command line client was found # Example usage: # find_package(Git) # if(GIT_FOUND) # message("git found: ${GIT_EXECUTABLE}") # endif() #============================================================================= # Copyright 2010 Kitware, Inc. # # Distributed under the OSI-approved BSD License (the "License"); # see accompanying file Copyright.txt for details. # # This software is distributed WITHOUT ANY WARRANTY; without even the # implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. # See the License for more information. #============================================================================= # (To distributed this file outside of CMake, substitute the full # License text for the above reference.) # Look for 'git' or 'eg' (easy git) # set(git_names git eg) # Prefer .cmd variants on Windows unless running in a Makefile # in the MSYS shell. 
# if(WIN32) if(NOT CMAKE_GENERATOR MATCHES "MSYS") set(git_names git.cmd git eg.cmd eg) endif() endif() find_program(GIT_EXECUTABLE NAMES ${git_names} DOC "git command line client" ) mark_as_advanced(GIT_EXECUTABLE) # Handle the QUIETLY and REQUIRED arguments and set GIT_FOUND to TRUE if # all listed variables are TRUE include(FindPackageHandleStandardArgs) find_package_handle_standard_args(Git DEFAULT_MSG GIT_EXECUTABLE) ================================================ FILE: cmake/Modules/FindNUMA.cmake ================================================ # Find numa library # Once done this will define # NUMA_FOUND - libnuma found # NUMA_OLD - old libnuma API if(NOT NUMA_FOUND) find_library(NUMA_LIBRARY NAMES numa PATH_SUFFIXES lib lib64) if(NUMA_LIBRARY) include(CheckLibraryExists) check_library_exists(${NUMA_LIBRARY} numa_available "" NUMA_FOUND_INTERNAL) if(NUMA_FOUND_INTERNAL) check_library_exists(${NUMA_LIBRARY} numa_allocate_nodemask "" NUMA_NEW_INTERNAL) if(NOT NUMA_NEW_INTERNAL) set(NUMA_OLD "yes" CACHE) endif() endif() include(FindPackageHandleStandardArgs) find_package_handle_standard_args(NUMA DEFAULT_MSG NUMA_LIBRARY) mark_as_advanced(NUMA_FOUND) endif() endif() ================================================ FILE: cmake/Modules/FindOpenCL.cmake ================================================ # # This file taken from FindOpenCL project @ http://gitorious.com/findopencl # # - Try to find OpenCL # This module tries to find an OpenCL implementation on your system. It supports # AMD / ATI, Apple and NVIDIA implementations, but should work, too. 
# # Once done this will define # OPENCL_FOUND - system has OpenCL # OPENCL_INCLUDE_DIRS - the OpenCL include directory # OPENCL_LIBRARIES - link these to use OpenCL # # WIN32 should work, but is untested FIND_PACKAGE( PackageHandleStandardArgs ) SET (OPENCL_VERSION_STRING "0.1.0") SET (OPENCL_VERSION_MAJOR 0) SET (OPENCL_VERSION_MINOR 1) SET (OPENCL_VERSION_PATCH 0) IF (APPLE) FIND_LIBRARY(OPENCL_LIBRARIES OpenCL DOC "OpenCL lib for OSX") FIND_PATH(OPENCL_INCLUDE_DIRS opencl/cl.h DOC "Include for OpenCL on OSX") FIND_PATH(_OPENCL_CPP_INCLUDE_DIRS opencl/cl.hpp DOC "Include for OpenCL CPP bindings on OSX") ELSE (APPLE) IF (WIN32) FIND_PATH(OPENCL_INCLUDE_DIRS CL/cl.h) FIND_PATH(_OPENCL_CPP_INCLUDE_DIRS CL/cl.hpp) # The AMD SDK currently installs both x86 and x86_64 libraries # This is only a hack to find out architecture IF( ${CMAKE_SYSTEM_PROCESSOR} STREQUAL "AMD64" ) SET(OPENCL_LIB_DIR "$ENV{ATISTREAMSDKROOT}/lib/x86_64") SET(OPENCL_LIB_DIR "$ENV{ATIINTERNALSTREAMSDKROOT}/lib/x86_64") ELSE (${CMAKE_SYSTEM_PROCESSOR} STREQUAL "AMD64") SET(OPENCL_LIB_DIR "$ENV{ATISTREAMSDKROOT}/lib/x86") SET(OPENCL_LIB_DIR "$ENV{ATIINTERNALSTREAMSDKROOT}/lib/x86") ENDIF( ${CMAKE_SYSTEM_PROCESSOR} STREQUAL "AMD64" ) # find out if the user asked for a 64-bit build, and use the corresponding # 64 or 32 bit NVIDIA library paths to the search: STRING(REGEX MATCH "Win64" ISWIN64 ${CMAKE_GENERATOR}) IF("${ISWIN64}" STREQUAL "Win64") FIND_LIBRARY(OPENCL_LIBRARIES OpenCL.lib ${OPENCL_LIB_DIR} $ENV{CUDA_LIB_PATH} $ENV{CUDA_PATH}/lib/x64) ELSE("${ISWIN64}" STREQUAL "Win64") FIND_LIBRARY(OPENCL_LIBRARIES OpenCL.lib ${OPENCL_LIB_DIR} $ENV{CUDA_LIB_PATH} $ENV{CUDA_PATH}/lib/Win32) ENDIF("${ISWIN64}" STREQUAL "Win64") GET_FILENAME_COMPONENT(_OPENCL_INC_CAND ${OPENCL_LIB_DIR}/../../include ABSOLUTE) # On Win32 search relative to the library FIND_PATH(OPENCL_INCLUDE_DIRS CL/cl.h PATHS "${_OPENCL_INC_CAND}" $ENV{CUDA_INC_PATH} $ENV{CUDA_PATH}/include) FIND_PATH(_OPENCL_CPP_INCLUDE_DIRS CL/cl.hpp 
PATHS "${_OPENCL_INC_CAND}" $ENV{CUDA_INC_PATH} $ENV{CUDA_PATH}/include) ELSE (WIN32) # Unix style platforms FIND_LIBRARY(OPENCL_LIBRARIES OpenCL ENV LD_LIBRARY_PATH ) GET_FILENAME_COMPONENT(OPENCL_LIB_DIR ${OPENCL_LIBRARIES} PATH) GET_FILENAME_COMPONENT(_OPENCL_INC_CAND ${OPENCL_LIB_DIR}/../../include ABSOLUTE) # The AMD SDK currently does not place its headers # in /usr/include, therefore also search relative # to the library FIND_PATH(OPENCL_INCLUDE_DIRS CL/cl.h PATHS ${_OPENCL_INC_CAND} $ENV{OPENCL_INCLUDE_DIRS} "/usr/local/cuda/include") FIND_PATH(_OPENCL_CPP_INCLUDE_DIRS CL/cl.hpp PATHS ${_OPENCL_INC_CAND} $ENV{OPENCL_LIB_DIR} "/usr/local/cuda/include") ENDIF (WIN32) ENDIF (APPLE) FIND_PACKAGE_HANDLE_STANDARD_ARGS( OpenCL DEFAULT_MSG OPENCL_LIBRARIES OPENCL_INCLUDE_DIRS ) IF( _OPENCL_CPP_INCLUDE_DIRS ) SET( OPENCL_HAS_CPP_BINDINGS TRUE ) LIST( APPEND OPENCL_INCLUDE_DIRS ${_OPENCL_CPP_INCLUDE_DIRS} ) # This is often the same, so clean up LIST( REMOVE_DUPLICATES OPENCL_INCLUDE_DIRS ) ENDIF( _OPENCL_CPP_INCLUDE_DIRS ) MARK_AS_ADVANCED( OPENCL_INCLUDE_DIRS ) ================================================ FILE: cmake/Modules/FindPAPI.cmake ================================================ # Find PAPI libraries # Once done this will define # PAPI_FOUND - System has PAPI # PAPI_INCLUDE_DIRS - The PAPI include directories # PAPI_LIBRARIES - The libraries needed to use PAPI if(PAPI_INCLUDE_DIRS AND PAPI_LIBRARIES) set(PAPI_FIND_QUIETLY TRUE) endif() # XXX(ddn): our system papi is broken so ignore for now # find_path(PAPI_INCLUDE_DIRS papi.h HINTS ${PAPI_ROOT} PATH_SUFFIXES include NO_DEFAULT_PATH ) find_path(PAPI_INCLUDE_DIRS papi.h HINTS ${PAPI_ROOT} ENV TACC_PAPI_DIR PATH_SUFFIXES include) message(STATUS "PAPI_INCLUDE_DIRS: ${PAPI_INCLUDE_DIRS}") find_library(PAPI_LIBRARY NAMES papi HINTS ${PAPI_ROOT} ENV TACC_PAPI_DIR PATH_SUFFIXES lib lib64) message(STATUS "PAPI_LIBRARY: ${PAPI_LIBRARY}") find_library(PAPI_LIBRARIES NAMES rt PATH_SUFFIXES lib lib64) 
include(FindPackageHandleStandardArgs) find_package_handle_standard_args(PAPI DEFAULT_MSG PAPI_LIBRARY PAPI_LIBRARIES PAPI_INCLUDE_DIRS) if(PAPI_FOUND) set(PAPI_LIBRARIES ${PAPI_LIBRARY} ${PAPI_LIBRARIES}) endif() mark_as_advanced(PAPI_INCLUDE_DIRS PAPI_LIBRARIES) ================================================ FILE: cmake/Modules/FindQGLViewer.cmake ================================================ # Find QGLViewer libraries # Once done this will define # QGLViewer_FOUND - System has QGLViewer # QGLViewer_INCLUDE_DIRS - The QGLViewer include directories # QGLViewer_LIBRARIES - The libraries needed to use QGLViewer if(QGLViewer_INCLUDE_DIRS AND QGLVIEWER_LIBRARIES) set(QGLViewer_FIND_QUIETLY TRUE) endif() find_path(QGLViewer_INCLUDE_DIRS NAMES QGLViewer/qglviewer.h) find_library(QGLViewer_LIBRARIES NAMES QGLViewer PATH_SUFFIXES lib lib64) include(FindPackageHandleStandardArgs) find_package_handle_standard_args(QGLViewer DEFAULT_MSG QGLViewer_INCLUDE_DIRS QGLViewer_LIBRARIES) if(QGLVIEWER_FOUND) set(QGLViewer_FOUND TRUE) endif() mark_as_advanced(QGLViewer_INCLUDE_DIRS QGLViewer_LIBRARIES) ================================================ FILE: cmake/Modules/FindTBB.cmake ================================================ # Locate Intel Threading Building Blocks include paths and libraries # FindTBB.cmake can be found at https://code.google.com/p/findtbb/ # Written by Hannes Hofmann # Improvements by Gino van den Bergen , # Florian Uhlig , # Jiri Marsik # The MIT License # # Copyright (c) 2011 Hannes Hofmann # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright 
notice and this permission notice shall be included in # all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. # GvdB: This module uses the environment variable TBB_ARCH_PLATFORM which defines architecture and compiler. # e.g. "ia32/vc8" or "em64t/cc4.1.0_libc2.4_kernel2.6.16.21" # TBB_ARCH_PLATFORM is set by the build script tbbvars[.bat|.sh|.csh], which can be found # in the TBB installation directory (TBB_INSTALL_DIR). # # GvdB: Mac OS X distribution places libraries directly in lib directory. # # For backwards compatibility, you may explicitely set the CMake variables TBB_ARCHITECTURE and TBB_COMPILER. # TBB_ARCHITECTURE [ ia32 | em64t | itanium ] # which architecture to use # TBB_COMPILER e.g. vc9 or cc3.2.3_libc2.3.2_kernel2.4.21 or cc4.0.1_os10.4.9 # which compiler to use (detected automatically on Windows) # This module respects # TBB_INSTALL_DIR or $ENV{TBB21_INSTALL_DIR} or $ENV{TBB_INSTALL_DIR} # This module defines # TBB_INCLUDE_DIRS, where to find task_scheduler_init.h, etc. # TBB_LIBRARY_DIRS, where to find libtbb, libtbbmalloc # TBB_DEBUG_LIBRARY_DIRS, where to find libtbb_debug, libtbbmalloc_debug # TBB_INSTALL_DIR, the base TBB install directory # TBB_LIBRARIES, the libraries to link against to use TBB. # TBB_DEBUG_LIBRARIES, the libraries to link against to use TBB with debug symbols. # TBB_FOUND, If false, don't try to use TBB. 
# TBB_INTERFACE_VERSION, as defined in tbb/tbb_stddef.h

if (WIN32)
    # has em64t/vc8 em64t/vc9
    # has ia32/vc7.1 ia32/vc8 ia32/vc9
    set(_TBB_DEFAULT_INSTALL_DIR "C:/Program Files/Intel/TBB" "C:/Program Files (x86)/Intel/TBB")
    set(_TBB_LIB_NAME "tbb")
    set(_TBB_LIB_MALLOC_NAME "${_TBB_LIB_NAME}malloc")
    set(_TBB_LIB_DEBUG_NAME "${_TBB_LIB_NAME}_debug")
    set(_TBB_LIB_MALLOC_DEBUG_NAME "${_TBB_LIB_MALLOC_NAME}_debug")
    # Map the MSVC toolchain in use to Intel's per-compiler lib subdirectory.
    if (MSVC71)
        set (_TBB_COMPILER "vc7.1")
    endif(MSVC71)
    if (MSVC80)
        set(_TBB_COMPILER "vc8")
    endif(MSVC80)
    if (MSVC90)
        set(_TBB_COMPILER "vc9")
    endif(MSVC90)
    if(MSVC10)
        set(_TBB_COMPILER "vc10")
    endif(MSVC10)
    # Todo: add other Windows compilers such as ICL.
    set(_TBB_ARCHITECTURE ${TBB_ARCHITECTURE})
endif (WIN32)

if (UNIX)
    if (APPLE)
        # MAC
        set(_TBB_DEFAULT_INSTALL_DIR "/Library/Frameworks/Intel_TBB.framework/Versions")
        # libs: libtbb.dylib, libtbbmalloc.dylib, *_debug
        set(_TBB_LIB_NAME "tbb")
        set(_TBB_LIB_MALLOC_NAME "${_TBB_LIB_NAME}malloc")
        set(_TBB_LIB_DEBUG_NAME "${_TBB_LIB_NAME}_debug")
        set(_TBB_LIB_MALLOC_DEBUG_NAME "${_TBB_LIB_MALLOC_NAME}_debug")
        # default flavor on apple: ia32/cc4.0.1_os10.4.9
        # Jiri: There is no reason to presume there is only one flavor and
        #       that user's setting of variables should be ignored.
        # Fix: these used to read "elseif(NOT TBB_COMPILER)" /
        # "elseif(NOT TBB_ARCHITECTURE)" - the same condition as the if() -
        # so the user-supplied value branch was dead code and an explicit
        # TBB_COMPILER/TBB_ARCHITECTURE was silently ignored (exactly what
        # the comment above says should NOT happen).
        if(NOT TBB_COMPILER)
            set(_TBB_COMPILER "cc4.0.1_os10.4.9")
        else()
            set(_TBB_COMPILER ${TBB_COMPILER})
        endif()
        if(NOT TBB_ARCHITECTURE)
            set(_TBB_ARCHITECTURE "ia32")
        else()
            set(_TBB_ARCHITECTURE ${TBB_ARCHITECTURE})
        endif()
    else (APPLE)
        # LINUX
        set(_TBB_DEFAULT_INSTALL_DIR "/opt/intel/tbb" "/usr/local/include" "/usr/include")
        set(_TBB_LIB_NAME "tbb")
        set(_TBB_LIB_MALLOC_NAME "${_TBB_LIB_NAME}malloc")
        set(_TBB_LIB_DEBUG_NAME "${_TBB_LIB_NAME}_debug")
        set(_TBB_LIB_MALLOC_DEBUG_NAME "${_TBB_LIB_MALLOC_NAME}_debug")
        # has em64t/cc3.2.3_libc2.3.2_kernel2.4.21 em64t/cc3.3.3_libc2.3.3_kernel2.6.5 em64t/cc3.4.3_libc2.3.4_kernel2.6.9 em64t/cc4.1.0_libc2.4_kernel2.6.16.21
        # has ia32/*
        # has itanium/*
        set(_TBB_COMPILER ${TBB_COMPILER})
        set(_TBB_ARCHITECTURE ${TBB_ARCHITECTURE})
    endif (APPLE)
endif (UNIX)

if (CMAKE_SYSTEM MATCHES "SunOS.*")
# SUN
# not yet supported
# has em64t/cc3.4.3_kernel5.10
# has ia32/*
endif (CMAKE_SYSTEM MATCHES "SunOS.*")


#-- Clear the public variables
set (TBB_FOUND "NO")


#-- Find TBB install dir and set ${_TBB_INSTALL_DIR} and cached ${TBB_INSTALL_DIR}
# first: use CMake variable TBB_INSTALL_DIR
if (TBB_INSTALL_DIR)
    set (_TBB_INSTALL_DIR ${TBB_INSTALL_DIR})
endif (TBB_INSTALL_DIR)
# second: use environment variable
if (NOT _TBB_INSTALL_DIR)
    if (NOT "$ENV{TBBROOT}" STREQUAL "")
        set (_TBB_INSTALL_DIR $ENV{TBBROOT})
    endif()
    if (NOT "$ENV{TBB_INSTALL_DIR}" STREQUAL "")
        set (_TBB_INSTALL_DIR $ENV{TBB_INSTALL_DIR})
    endif (NOT "$ENV{TBB_INSTALL_DIR}" STREQUAL "")
    # Intel recommends setting TBB21_INSTALL_DIR
    if (NOT "$ENV{TBB21_INSTALL_DIR}" STREQUAL "")
        set (_TBB_INSTALL_DIR $ENV{TBB21_INSTALL_DIR})
    endif (NOT "$ENV{TBB21_INSTALL_DIR}" STREQUAL "")
    if (NOT "$ENV{TBB22_INSTALL_DIR}" STREQUAL "")
        set (_TBB_INSTALL_DIR $ENV{TBB22_INSTALL_DIR})
    endif (NOT "$ENV{TBB22_INSTALL_DIR}" STREQUAL "")
    if (NOT "$ENV{TBB30_INSTALL_DIR}" STREQUAL "")
        set (_TBB_INSTALL_DIR $ENV{TBB30_INSTALL_DIR})
    endif (NOT "$ENV{TBB30_INSTALL_DIR}" STREQUAL "")
endif (NOT _TBB_INSTALL_DIR)
# third: try to find path automatically
if (NOT _TBB_INSTALL_DIR)
    if (_TBB_DEFAULT_INSTALL_DIR)
        set (_TBB_INSTALL_DIR ${_TBB_DEFAULT_INSTALL_DIR})
    endif (_TBB_DEFAULT_INSTALL_DIR)
endif (NOT _TBB_INSTALL_DIR)
# sanity check
if (NOT _TBB_INSTALL_DIR)
    message ("ERROR: Unable to find Intel TBB install directory. ${_TBB_INSTALL_DIR}")
else (NOT _TBB_INSTALL_DIR)
# NOTE: everything below (through the matching endif in the epilogue) only
# runs once an install dir candidate exists.
# finally: set the cached CMake variable TBB_INSTALL_DIR
if (NOT TBB_INSTALL_DIR)
    set (TBB_INSTALL_DIR ${_TBB_INSTALL_DIR} CACHE PATH "Intel TBB install directory")
    mark_as_advanced(TBB_INSTALL_DIR)
endif (NOT TBB_INSTALL_DIR)


#-- A macro to rewrite the paths of the library. This is necessary, because
#   find_library() always found the em64t/vc9 version of the TBB libs
# Rewrites the em64t/ia32 architecture and vcN compiler components embedded
# in a found library path to the flavor selected above.  Currently unused:
# the TBB_CORRECT_LIB_DIR call sites below are commented out.
macro(TBB_CORRECT_LIB_DIR var_name)
#    if (NOT "${_TBB_ARCHITECTURE}" STREQUAL "em64t")
        string(REPLACE em64t "${_TBB_ARCHITECTURE}" ${var_name} ${${var_name}})
#    endif (NOT "${_TBB_ARCHITECTURE}" STREQUAL "em64t")
    string(REPLACE ia32 "${_TBB_ARCHITECTURE}" ${var_name} ${${var_name}})
    string(REPLACE vc7.1 "${_TBB_COMPILER}" ${var_name} ${${var_name}})
    string(REPLACE vc8 "${_TBB_COMPILER}" ${var_name} ${${var_name}})
    string(REPLACE vc9 "${_TBB_COMPILER}" ${var_name} ${${var_name}})
    string(REPLACE vc10 "${_TBB_COMPILER}" ${var_name} ${${var_name}})
# NOTE(review): the endmacro argument below names "var_content" rather than
# the macro name; legacy CMake ignores it, kept verbatim.
endmacro(TBB_CORRECT_LIB_DIR var_content)


#-- Look for include directory and set ${TBB_INCLUDE_DIR}
set (TBB_INC_SEARCH_DIR ${_TBB_INSTALL_DIR}/include)
# Jiri: tbbvars now sets the CPATH environment variable to the directory
#       containing the headers.
find_path(TBB_INCLUDE_DIR
    tbb/task_scheduler_init.h
    PATHS ${TBB_INC_SEARCH_DIR} ENV CPATH
)
mark_as_advanced(TBB_INCLUDE_DIR)


#-- Look for libraries
# GvdB: $ENV{TBB_ARCH_PLATFORM} is set by the build script tbbvars[.bat|.sh|.csh]
# Fix: quote the expansions in the STREQUAL tests below.  Unquoted, an unset
# environment/cache variable expands to nothing and if() fails with
# "Unknown arguments specified" instead of evaluating to false.
if (NOT "$ENV{TBB_ARCH_PLATFORM}" STREQUAL "")
    set (_TBB_LIBRARY_DIR
         ${_TBB_INSTALL_DIR}/lib/$ENV{TBB_ARCH_PLATFORM}
         ${_TBB_INSTALL_DIR}/$ENV{TBB_ARCH_PLATFORM}/lib
        )
endif ()

# Jiri: This block isn't mutually exclusive with the previous one
#       (hence no else), instead I test if the user really specified
#       the variables in question.
if ((NOT "${TBB_ARCHITECTURE}" STREQUAL "") AND (NOT "${TBB_COMPILER}" STREQUAL ""))
    # HH: deprecated
    message(STATUS "[Warning] FindTBB.cmake: The use of TBB_ARCHITECTURE and TBB_COMPILER is deprecated and may not be supported in future versions. Please set \$ENV{TBB_ARCH_PLATFORM} (using tbbvars.[bat|csh|sh]).")
    # Jiri: It doesn't hurt to look in more places, so I store the hints from
    #       ENV{TBB_ARCH_PLATFORM} and the TBB_ARCHITECTURE and TBB_COMPILER
    #       variables and search them both.
    set (_TBB_LIBRARY_DIR "${_TBB_INSTALL_DIR}/${_TBB_ARCHITECTURE}/${_TBB_COMPILER}/lib" ${_TBB_LIBRARY_DIR})
endif ()

# GvdB: Mac OS X distribution places libraries directly in lib directory.
list(APPEND _TBB_LIBRARY_DIR ${_TBB_INSTALL_DIR}/lib)

# Jiri: No reason not to check the default paths. From recent versions,
#       tbbvars has started exporting the LIBRARY_PATH and LD_LIBRARY_PATH
#       variables, which now point to the directories of the lib files.
#       It all makes more sense to use the ${_TBB_LIBRARY_DIR} as a HINTS
#       argument instead of the implicit PATHS as it isn't hard-coded
#       but computed by system introspection. Searching the LIBRARY_PATH
#       and LD_LIBRARY_PATH environment variables is now even more important
#       that tbbvars doesn't export TBB_ARCH_PLATFORM and it facilitates
#       the use of TBB built from sources.
# Release libraries: HINTS come from the computed install layout, PATHS from
# the environment exported by tbbvars.
find_library(TBB_LIBRARY ${_TBB_LIB_NAME} HINTS ${_TBB_LIBRARY_DIR}
        PATHS ENV LIBRARY_PATH ENV LD_LIBRARY_PATH)
find_library(TBB_MALLOC_LIBRARY ${_TBB_LIB_MALLOC_NAME} HINTS ${_TBB_LIBRARY_DIR}
        PATHS ENV LIBRARY_PATH ENV LD_LIBRARY_PATH)

#Extract path from TBB_LIBRARY name
get_filename_component(TBB_LIBRARY_DIR ${TBB_LIBRARY} PATH)

#TBB_CORRECT_LIB_DIR(TBB_LIBRARY)
#TBB_CORRECT_LIB_DIR(TBB_MALLOC_LIBRARY)
mark_as_advanced(TBB_LIBRARY TBB_MALLOC_LIBRARY)

#-- Look for debug libraries
# Jiri: Changed the same way as for the release libraries.
find_library(TBB_LIBRARY_DEBUG ${_TBB_LIB_DEBUG_NAME} HINTS ${_TBB_LIBRARY_DIR}
        PATHS ENV LIBRARY_PATH ENV LD_LIBRARY_PATH)
find_library(TBB_MALLOC_LIBRARY_DEBUG ${_TBB_LIB_MALLOC_DEBUG_NAME} HINTS ${_TBB_LIBRARY_DIR}
        PATHS ENV LIBRARY_PATH ENV LD_LIBRARY_PATH)

# Jiri: Self-built TBB stores the debug libraries in a separate directory.
#       Extract path from TBB_LIBRARY_DEBUG name
get_filename_component(TBB_LIBRARY_DEBUG_DIR ${TBB_LIBRARY_DEBUG} PATH)

#TBB_CORRECT_LIB_DIR(TBB_LIBRARY_DEBUG)
#TBB_CORRECT_LIB_DIR(TBB_MALLOC_LIBRARY_DEBUG)
mark_as_advanced(TBB_LIBRARY_DEBUG TBB_MALLOC_LIBRARY_DEBUG)

# Only the release library is required for TBB_FOUND; debug variables may be
# NOTFOUND and are passed through to TBB_DEBUG_LIBRARIES as-is.
if (TBB_INCLUDE_DIR)
    if (TBB_LIBRARY)
        set (TBB_FOUND "YES")
        set (TBB_LIBRARIES ${TBB_LIBRARY} ${TBB_MALLOC_LIBRARY} ${TBB_LIBRARIES})
        set (TBB_DEBUG_LIBRARIES ${TBB_LIBRARY_DEBUG} ${TBB_MALLOC_LIBRARY_DEBUG} ${TBB_DEBUG_LIBRARIES})
        set (TBB_INCLUDE_DIRS ${TBB_INCLUDE_DIR} CACHE PATH "TBB include directory" FORCE)
        set (TBB_LIBRARY_DIRS ${TBB_LIBRARY_DIR} CACHE PATH "TBB library directory" FORCE)
        # Jiri: Self-built TBB stores the debug libraries in a separate directory.
set (TBB_DEBUG_LIBRARY_DIRS ${TBB_LIBRARY_DEBUG_DIR} CACHE PATH "TBB debug library directory" FORCE) mark_as_advanced(TBB_INCLUDE_DIRS TBB_LIBRARY_DIRS TBB_DEBUG_LIBRARY_DIRS TBB_LIBRARIES TBB_DEBUG_LIBRARIES) message(STATUS "Found Intel TBB") endif (TBB_LIBRARY) endif (TBB_INCLUDE_DIR) if (NOT TBB_FOUND) message("ERROR: Intel TBB NOT found!") message(STATUS "Looked for Threading Building Blocks in ${_TBB_INSTALL_DIR}") # do only throw fatal, if this pkg is REQUIRED if (TBB_FIND_REQUIRED) message(FATAL_ERROR "Could NOT find TBB library.") endif (TBB_FIND_REQUIRED) endif (NOT TBB_FOUND) endif (NOT _TBB_INSTALL_DIR) if (TBB_FOUND) set(TBB_INTERFACE_VERSION 0) # FILE(READ "${TBB_INCLUDE_DIRS}/tbb/tbb_stddef.h" _TBB_VERSION_CONTENTS) STRING(REGEX REPLACE ".*#define TBB_INTERFACE_VERSION ([0-9]+).*" "\\1" TBB_INTERFACE_VERSION "${_TBB_VERSION_CONTENTS}") set(TBB_INTERFACE_VERSION "${TBB_INTERFACE_VERSION}") endif (TBB_FOUND) ================================================ FILE: cmake/Modules/FindVTune.cmake ================================================ # Find VTune libraries # Once done this will define # VTune_FOUND - System has VTune # VTune_INCLUDE_DIRS - The VTune include directories # VTune_LIBRARIES - The libraries needed to use VTune message(STATUS "${VTune_INCLUDE_DIRS}") if(VTune_INCLUDE_DIRS AND VTune_LIBRARIES) set(VTune_FIND_QUIETLY TRUE) endif() set(VTune_LIBRARY_PATH_CANDIDATES lib lib64 lib32 bin64/k1om bin32/k1om) find_path(VTune_INCLUDE_DIRS ittnotify.h PATHS ${VTune_ROOT} PATH_SUFFIXES include) find_library(VTune_LIBRARY NAMES ittnotify PATHS ${VTune_ROOT} PATH_SUFFIXES ${VTune_LIBRARY_PATH_CANDIDATES}) find_library(VTune_LIBRARIES NAMES dl PATH_SUFFIXES lib lib64 lib32) include(FindPackageHandleStandardArgs) find_package_handle_standard_args(VTune DEFAULT_MSG VTune_LIBRARY VTune_LIBRARIES VTune_INCLUDE_DIRS) if(VTUNE_FOUND) set(VTune_FOUND on) set(VTune_LIBRARIES ${VTune_LIBRARY} ${VTune_LIBRARIES}) endif() mark_as_advanced(VTune_INCLUDE_DIRS 
                 VTune_LIBRARIES)

================================================
FILE: cmake/Modules/GetGitVersion-write.cmake
================================================
### Don't include directly, for use by GetSVNVersion.cmake
find_package(Git)

# Extract svn info into MY_XXX variables
# Writes the short HEAD hash (or "0" when git is unavailable) into
# revision.h.txt, then copies it over revision.h only when it changed so
# dependents are not rebuilt needlessly.
if(GIT_FOUND)
  execute_process(COMMAND ${GIT_EXECUTABLE} rev-parse --verify --short HEAD
    WORKING_DIRECTORY ${SOURCE_DIR}
    OUTPUT_VARIABLE GIT_REVISION OUTPUT_STRIP_TRAILING_WHITESPACE)
  file(WRITE include/galois/revision.h.txt "#define GALOIS_REVISION \"${GIT_REVISION}\"\n")
else()
  file(WRITE include/galois/revision.h.txt "#define GALOIS_REVISION \"0\"\n")
endif()
execute_process(COMMAND ${CMAKE_COMMAND} -E copy_if_different include/galois/revision.h.txt include/galois/revision.h)

================================================
FILE: cmake/Modules/GetGitVersion.cmake
================================================
# DUMMY is a non-existent file to force regeneration of svn header every build
add_custom_target(revision ALL DEPENDS DUMMY ${PROJECT_BINARY_DIR}/include/galois/revision.h)
find_file(_MODULE "GetGitVersion-write.cmake" PATHS ${CMAKE_MODULE_PATH})
# Runs GetGitVersion-write.cmake in script mode at build time.
add_custom_command(OUTPUT DUMMY ${PROJECT_BINARY_DIR}/include/galois/revision.h
  COMMAND ${CMAKE_COMMAND}
    -DSOURCE_DIR=${CMAKE_SOURCE_DIR}
    -DCMAKE_MODULE_PATH="${CMAKE_SOURCE_DIR}/cmake/Modules/"
    -P ${_MODULE})
set(_MODULE off)
set_source_files_properties(${PROJECT_BINARY_DIR}/include/galois/revision.h
  PROPERTIES GENERATED TRUE
             HEADER_FILE_ONLY TRUE)

================================================
FILE: cmake/Modules/HandleSanitizer.cmake
================================================
# Galois: taken from:
# https://github.com/llvm/llvm-project/blob/master/llvm/cmake/modules/HandleLLVMOptions.cmake
include(CheckCCompilerFlag)
include(CheckCXXCompilerFlag)

string(TOUPPER "${CMAKE_BUILD_TYPE}" uppercase_CMAKE_BUILD_TYPE)

# Nothing to do unless a sanitizer was requested.
if(NOT GALOIS_USE_SANITIZER)
  return()
endif()

# Appends ${value} to each named flag variable in the caller's scope.
function(append value)
  foreach(variable ${ARGN})
    set(${variable}
 "${${variable}} ${value}" PARENT_SCOPE)
  endforeach(variable)
endfunction()

# Like append(), but only when ${condition} evaluates true.
function(append_if condition value)
  if (${condition})
    foreach(variable ${ARGN})
      set(${variable} "${${variable}} ${value}" PARENT_SCOPE)
    endforeach(variable)
  endif()
endfunction()

# Probes ${flag} with -Werror for both C and C++ and appends it to the
# respective flag variables when supported.
macro(add_flag_if_supported flag name)
  check_c_compiler_flag("-Werror ${flag}" "C_SUPPORTS_${name}")
  append_if("C_SUPPORTS_${name}" "${flag}" CMAKE_C_FLAGS)
  check_cxx_compiler_flag("-Werror ${flag}" "CXX_SUPPORTS_${name}")
  append_if("CXX_SUPPORTS_${name}" "${flag}" CMAKE_CXX_FLAGS)
endmacro()

macro(append_common_sanitizer_flags)
  # Append -fno-omit-frame-pointer and turn on debug info to get better
  # stack traces.
  add_flag_if_supported("-fno-omit-frame-pointer" FNO_OMIT_FRAME_POINTER)
  if (NOT uppercase_CMAKE_BUILD_TYPE STREQUAL "DEBUG" AND
      NOT uppercase_CMAKE_BUILD_TYPE STREQUAL "RELWITHDEBINFO")
    add_flag_if_supported("-gline-tables-only" GLINE_TABLES_ONLY)
  endif()
  # Use -O1 even in debug mode, otherwise sanitizers slowdown is too large.
  if (uppercase_CMAKE_BUILD_TYPE STREQUAL "DEBUG")
    add_flag_if_supported("-O1" O1)
  endif()
endmacro()

# Translate the GALOIS_USE_SANITIZER value into -fsanitize= flags.  The
# two-element combination is matched as a literal "A;B" string in either order.
if (GALOIS_USE_SANITIZER STREQUAL "Address")
  append_common_sanitizer_flags()
  append("-fsanitize=address" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
elseif (GALOIS_USE_SANITIZER STREQUAL "HWAddress")
  append_common_sanitizer_flags()
  append("-fsanitize=hwaddress" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
elseif (GALOIS_USE_SANITIZER MATCHES "Memory(WithOrigins)?")
  append_common_sanitizer_flags()
  append("-fsanitize=memory" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
  if(GALOIS_USE_SANITIZER STREQUAL "MemoryWithOrigins")
    append("-fsanitize-memory-track-origins" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
  endif()
elseif (GALOIS_USE_SANITIZER STREQUAL "Undefined")
  append_common_sanitizer_flags()
  append("-fsanitize=undefined -fno-sanitize-recover=all" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
elseif (GALOIS_USE_SANITIZER STREQUAL "Thread")
  append_common_sanitizer_flags()
  append("-fsanitize=thread" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
elseif (GALOIS_USE_SANITIZER STREQUAL "DataFlow")
  append("-fsanitize=dataflow" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
elseif (GALOIS_USE_SANITIZER STREQUAL "Address;Undefined" OR
        GALOIS_USE_SANITIZER STREQUAL "Undefined;Address")
  append_common_sanitizer_flags()
  append("-fsanitize=address,undefined -fno-sanitize-recover=all" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
elseif (GALOIS_USE_SANITIZER STREQUAL "Leaks")
  append_common_sanitizer_flags()
  append("-fsanitize=leak" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
else()
  message(FATAL_ERROR "Unsupported value of GALOIS_USE_SANITIZER: ${GALOIS_USE_SANITIZER}")
endif()

if (GALOIS_USE_SANITIZER MATCHES "(Undefined;)?Address(;Undefined)?")
  add_flag_if_supported("-fsanitize-address-use-after-scope"
                        FSANITIZE_USE_AFTER_SCOPE_FLAG)
endif()

if (GALOIS_USE_SANITIZE_COVERAGE)
  append("-fsanitize=fuzzer-no-link" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
endif()

# UBSan: suppress reports from the bundled bliss sources via a generated
# blacklist file.
if (GALOIS_USE_SANITIZER MATCHES ".*Undefined.*")
  set(BLACKLIST_CONFIGURE_FILE "${PROJECT_SOURCE_DIR}/config/sanitizers/ubsan_blacklist.txt.in")
  if (EXISTS "${BLACKLIST_CONFIGURE_FILE}")
    set(BLACKLIST_FILE "${PROJECT_BINARY_DIR}/config/sanitizers/ubsan_blacklist.txt")
    configure_file("${BLACKLIST_CONFIGURE_FILE}" "${BLACKLIST_FILE}")
    append("-fsanitize-blacklist=${BLACKLIST_FILE}" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
  endif()
endif()

================================================
FILE: cmake/Modules/LibFindMacros.cmake
================================================
# Copyright Raimar Sandner 2012–2014. Distributed under the Boost Software License, Version 1.0. (See accompanying file LICENSE.txt)

#! \file
#! \ingroup Helpers
#! \brief Improved versions of %CMake's `find_package`

#! \ingroup Helpers
#! \brief Works the same as `find_package`, but forwards the "REQUIRED" and "QUIET" arguments
#! used for the current package.
#!
#! For this to work, the first parameter must be the prefix of the current package, then the
#! prefix of the new package etc, which are passed to `find_package`.
macro (libfind_package PREFIX) set (LIBFIND_PACKAGE_ARGS ${ARGN}) if (${PREFIX}_FIND_QUIETLY) set (LIBFIND_PACKAGE_ARGS ${LIBFIND_PACKAGE_ARGS} QUIET) endif (${PREFIX}_FIND_QUIETLY) if (${PREFIX}_FIND_REQUIRED) set (LIBFIND_PACKAGE_ARGS ${LIBFIND_PACKAGE_ARGS} REQUIRED) endif (${PREFIX}_FIND_REQUIRED) find_package(${LIBFIND_PACKAGE_ARGS}) endmacro (libfind_package) #! \ingroup Helpers #! \brief Do the final processing once the paths have been detected. #! #! If include dirs are needed, `${PREFIX}_PROCESS_INCLUDES` should be set to contain #! all the variables, each of which contain one include directory. #! Ditto for `${PREFIX}_PROCESS_LIBS` and library files. #! Will set `${PREFIX}_FOUND`, `${PREFIX}_INCLUDE_DIRS` and `${PREFIX}_LIBRARIES`. #! Also handles errors in case library detection was required, etc. macro (libfind_process PREFIX) # Skip processing if already processed during this run if (NOT ${PREFIX}_FOUND) # Start with the assumption that the library was found set (${PREFIX}_FOUND TRUE) # Process all includes and set _FOUND to false if any are missing foreach (i ${${PREFIX}_PROCESS_INCLUDES}) if (${i}) set (${PREFIX}_INCLUDE_DIRS ${${PREFIX}_INCLUDE_DIRS} ${${i}}) mark_as_advanced(${i}) else (${i}) set (${PREFIX}_FOUND FALSE) endif (${i}) endforeach (i) # Process all libraries and set _FOUND to false if any are missing foreach (i ${${PREFIX}_PROCESS_LIBS}) if (${i}) set (${PREFIX}_LIBRARIES ${${PREFIX}_LIBRARIES} ${${i}}) mark_as_advanced(${i}) else (${i}) set (${PREFIX}_FOUND FALSE) endif (${i}) endforeach (i) # Print message and/or exit on fatal error if (${PREFIX}_FOUND) if (NOT ${PREFIX}_FIND_QUIETLY) message (STATUS "Found ${PREFIX} ${${PREFIX}_VERSION}") endif (NOT ${PREFIX}_FIND_QUIETLY) else (${PREFIX}_FOUND) if (${PREFIX}_FIND_REQUIRED) foreach (i ${${PREFIX}_PROCESS_INCLUDES} ${${PREFIX}_PROCESS_LIBS}) message("${i}=${${i}}") endforeach (i) message (FATAL_ERROR "Required library ${PREFIX} NOT FOUND.\nInstall the library (dev version) and try 
again. If the library is already installed, use ccmake to set the missing variables manually.") endif (${PREFIX}_FIND_REQUIRED) endif (${PREFIX}_FOUND) endif (NOT ${PREFIX}_FOUND) endmacro (libfind_process) ================================================ FILE: cmake/Modules/UseStdMacro.cmake ================================================ add_definitions(-D__STDC_LIMIT_MACROS) add_definitions(-D__STDC_CONSTANT_MACROS) ================================================ FILE: config/sanitizers/ubsan_blacklist.txt.in ================================================ [undefined] src:@PROJECT_SOURCE_DIR@/external/bliss/* ================================================ FILE: config/version.txt ================================================ 6.0.0 ================================================ FILE: external/bliss/bliss/COPYING ================================================ GNU GENERAL PUBLIC LICENSE Version 3, 29 June 2007 Copyright (C) 2007 Free Software Foundation, Inc. Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. Preamble The GNU General Public License is a free, copyleft license for software and other kinds of works. The licenses for most software and other practical works are designed to take away your freedom to share and change the works. By contrast, the GNU General Public License is intended to guarantee your freedom to share and change all versions of a program--to make sure it remains free software for all its users. We, the Free Software Foundation, use the GNU General Public License for most of our software; it applies also to any other work released this way by its authors. You can apply it to your programs, too. When we speak of free software, we are referring to freedom, not price. 
Our General Public Licenses are designed to make sure that you have the freedom to distribute copies of free software (and charge for them if you wish), that you receive source code or can get it if you want it, that you can change the software or use pieces of it in new free programs, and that you know you can do these things. To protect your rights, we need to prevent others from denying you these rights or asking you to surrender the rights. Therefore, you have certain responsibilities if you distribute copies of the software, or if you modify it: responsibilities to respect the freedom of others. For example, if you distribute copies of such a program, whether gratis or for a fee, you must pass on to the recipients the same freedoms that you received. You must make sure that they, too, receive or can get the source code. And you must show them these terms so they know their rights. Developers that use the GNU GPL protect your rights with two steps: (1) assert copyright on the software, and (2) offer you this License giving you legal permission to copy, distribute and/or modify it. For the developers' and authors' protection, the GPL clearly explains that there is no warranty for this free software. For both users' and authors' sake, the GPL requires that modified versions be marked as changed, so that their problems will not be attributed erroneously to authors of previous versions. Some devices are designed to deny users access to install or run modified versions of the software inside them, although the manufacturer can do so. This is fundamentally incompatible with the aim of protecting users' freedom to change the software. The systematic pattern of such abuse occurs in the area of products for individuals to use, which is precisely where it is most unacceptable. Therefore, we have designed this version of the GPL to prohibit the practice for those products. 
If such problems arise substantially in other domains, we stand ready to extend this provision to those domains in future versions of the GPL, as needed to protect the freedom of users. Finally, every program is threatened constantly by software patents. States should not allow patents to restrict development and use of software on general-purpose computers, but in those that do, we wish to avoid the special danger that patents applied to a free program could make it effectively proprietary. To prevent this, the GPL assures that patents cannot be used to render the program non-free. The precise terms and conditions for copying, distribution and modification follow. TERMS AND CONDITIONS 0. Definitions. "This License" refers to version 3 of the GNU General Public License. "Copyright" also means copyright-like laws that apply to other kinds of works, such as semiconductor masks. "The Program" refers to any copyrightable work licensed under this License. Each licensee is addressed as "you". "Licensees" and "recipients" may be individuals or organizations. To "modify" a work means to copy from or adapt all or part of the work in a fashion requiring copyright permission, other than the making of an exact copy. The resulting work is called a "modified version" of the earlier work or a work "based on" the earlier work. A "covered work" means either the unmodified Program or a work based on the Program. To "propagate" a work means to do anything with it that, without permission, would make you directly or secondarily liable for infringement under applicable copyright law, except executing it on a computer or modifying a private copy. Propagation includes copying, distribution (with or without modification), making available to the public, and in some countries other activities as well. To "convey" a work means any kind of propagation that enables other parties to make or receive copies. 
Mere interaction with a user through a computer network, with no transfer of a copy, is not conveying. An interactive user interface displays "Appropriate Legal Notices" to the extent that it includes a convenient and prominently visible feature that (1) displays an appropriate copyright notice, and (2) tells the user that there is no warranty for the work (except to the extent that warranties are provided), that licensees may convey the work under this License, and how to view a copy of this License. If the interface presents a list of user commands or options, such as a menu, a prominent item in the list meets this criterion. 1. Source Code. The "source code" for a work means the preferred form of the work for making modifications to it. "Object code" means any non-source form of a work. A "Standard Interface" means an interface that either is an official standard defined by a recognized standards body, or, in the case of interfaces specified for a particular programming language, one that is widely used among developers working in that language. The "System Libraries" of an executable work include anything, other than the work as a whole, that (a) is included in the normal form of packaging a Major Component, but which is not part of that Major Component, and (b) serves only to enable use of the work with that Major Component, or to implement a Standard Interface for which an implementation is available to the public in source code form. A "Major Component", in this context, means a major essential component (kernel, window system, and so on) of the specific operating system (if any) on which the executable work runs, or a compiler used to produce the work, or an object code interpreter used to run it. The "Corresponding Source" for a work in object code form means all the source code needed to generate, install, and (for an executable work) run the object code and to modify the work, including scripts to control those activities. 
However, it does not include the work's System Libraries, or general-purpose tools or generally available free programs which are used unmodified in performing those activities but which are not part of the work. For example, Corresponding Source includes interface definition files associated with source files for the work, and the source code for shared libraries and dynamically linked subprograms that the work is specifically designed to require, such as by intimate data communication or control flow between those subprograms and other parts of the work. The Corresponding Source need not include anything that users can regenerate automatically from other parts of the Corresponding Source. The Corresponding Source for a work in source code form is that same work. 2. Basic Permissions. All rights granted under this License are granted for the term of copyright on the Program, and are irrevocable provided the stated conditions are met. This License explicitly affirms your unlimited permission to run the unmodified Program. The output from running a covered work is covered by this License only if the output, given its content, constitutes a covered work. This License acknowledges your rights of fair use or other equivalent, as provided by copyright law. You may make, run and propagate covered works that you do not convey, without conditions so long as your license otherwise remains in force. You may convey covered works to others for the sole purpose of having them make modifications exclusively for you, or provide you with facilities for running those works, provided that you comply with the terms of this License in conveying all material for which you do not control copyright. Those thus making or running the covered works for you must do so exclusively on your behalf, under your direction and control, on terms that prohibit them from making any copies of your copyrighted material outside their relationship with you. 
Conveying under any other circumstances is permitted solely under the conditions stated below. Sublicensing is not allowed; section 10 makes it unnecessary. 3. Protecting Users' Legal Rights From Anti-Circumvention Law. No covered work shall be deemed part of an effective technological measure under any applicable law fulfilling obligations under article 11 of the WIPO copyright treaty adopted on 20 December 1996, or similar laws prohibiting or restricting circumvention of such measures. When you convey a covered work, you waive any legal power to forbid circumvention of technological measures to the extent such circumvention is effected by exercising rights under this License with respect to the covered work, and you disclaim any intention to limit operation or modification of the work as a means of enforcing, against the work's users, your or third parties' legal rights to forbid circumvention of technological measures. 4. Conveying Verbatim Copies. You may convey verbatim copies of the Program's source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice; keep intact all notices stating that this License and any non-permissive terms added in accord with section 7 apply to the code; keep intact all notices of the absence of any warranty; and give all recipients a copy of this License along with the Program. You may charge any price or no price for each copy that you convey, and you may offer support or warranty protection for a fee. 5. Conveying Modified Source Versions. You may convey a work based on the Program, or the modifications to produce it from the Program, in the form of source code under the terms of section 4, provided that you also meet all of these conditions: a) The work must carry prominent notices stating that you modified it, and giving a relevant date. 
b) The work must carry prominent notices stating that it is released under this License and any conditions added under section 7. This requirement modifies the requirement in section 4 to "keep intact all notices". c) You must license the entire work, as a whole, under this License to anyone who comes into possession of a copy. This License will therefore apply, along with any applicable section 7 additional terms, to the whole of the work, and all its parts, regardless of how they are packaged. This License gives no permission to license the work in any other way, but it does not invalidate such permission if you have separately received it. d) If the work has interactive user interfaces, each must display Appropriate Legal Notices; however, if the Program has interactive interfaces that do not display Appropriate Legal Notices, your work need not make them do so. A compilation of a covered work with other separate and independent works, which are not by their nature extensions of the covered work, and which are not combined with it such as to form a larger program, in or on a volume of a storage or distribution medium, is called an "aggregate" if the compilation and its resulting copyright are not used to limit the access or legal rights of the compilation's users beyond what the individual works permit. Inclusion of a covered work in an aggregate does not cause this License to apply to the other parts of the aggregate. 6. Conveying Non-Source Forms. You may convey a covered work in object code form under the terms of sections 4 and 5, provided that you also convey the machine-readable Corresponding Source under the terms of this License, in one of these ways: a) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by the Corresponding Source fixed on a durable physical medium customarily used for software interchange. 
b) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by a written offer, valid for at least three years and valid for as long as you offer spare parts or customer support for that product model, to give anyone who possesses the object code either (1) a copy of the Corresponding Source for all the software in the product that is covered by this License, on a durable physical medium customarily used for software interchange, for a price no more than your reasonable cost of physically performing this conveying of source, or (2) access to copy the Corresponding Source from a network server at no charge. c) Convey individual copies of the object code with a copy of the written offer to provide the Corresponding Source. This alternative is allowed only occasionally and noncommercially, and only if you received the object code with such an offer, in accord with subsection 6b. d) Convey the object code by offering access from a designated place (gratis or for a charge), and offer equivalent access to the Corresponding Source in the same way through the same place at no further charge. You need not require recipients to copy the Corresponding Source along with the object code. If the place to copy the object code is a network server, the Corresponding Source may be on a different server (operated by you or a third party) that supports equivalent copying facilities, provided you maintain clear directions next to the object code saying where to find the Corresponding Source. Regardless of what server hosts the Corresponding Source, you remain obligated to ensure that it is available for as long as needed to satisfy these requirements. e) Convey the object code using peer-to-peer transmission, provided you inform other peers where the object code and Corresponding Source of the work are being offered to the general public at no charge under subsection 6d. 
A separable portion of the object code, whose source code is excluded from the Corresponding Source as a System Library, need not be included in conveying the object code work. A "User Product" is either (1) a "consumer product", which means any tangible personal property which is normally used for personal, family, or household purposes, or (2) anything designed or sold for incorporation into a dwelling. In determining whether a product is a consumer product, doubtful cases shall be resolved in favor of coverage. For a particular product received by a particular user, "normally used" refers to a typical or common use of that class of product, regardless of the status of the particular user or of the way in which the particular user actually uses, or expects or is expected to use, the product. A product is a consumer product regardless of whether the product has substantial commercial, industrial or non-consumer uses, unless such uses represent the only significant mode of use of the product. "Installation Information" for a User Product means any methods, procedures, authorization keys, or other information required to install and execute modified versions of a covered work in that User Product from a modified version of its Corresponding Source. The information must suffice to ensure that the continued functioning of the modified object code is in no case prevented or interfered with solely because modification has been made. If you convey an object code work under this section in, or with, or specifically for use in, a User Product, and the conveying occurs as part of a transaction in which the right of possession and use of the User Product is transferred to the recipient in perpetuity or for a fixed term (regardless of how the transaction is characterized), the Corresponding Source conveyed under this section must be accompanied by the Installation Information. 
But this requirement does not apply if neither you nor any third party retains the ability to install modified object code on the User Product (for example, the work has been installed in ROM). The requirement to provide Installation Information does not include a requirement to continue to provide support service, warranty, or updates for a work that has been modified or installed by the recipient, or for the User Product in which it has been modified or installed. Access to a network may be denied when the modification itself materially and adversely affects the operation of the network or violates the rules and protocols for communication across the network. Corresponding Source conveyed, and Installation Information provided, in accord with this section must be in a format that is publicly documented (and with an implementation available to the public in source code form), and must require no special password or key for unpacking, reading or copying. 7. Additional Terms. "Additional permissions" are terms that supplement the terms of this License by making exceptions from one or more of its conditions. Additional permissions that are applicable to the entire Program shall be treated as though they were included in this License, to the extent that they are valid under applicable law. If additional permissions apply only to part of the Program, that part may be used separately under those permissions, but the entire Program remains governed by this License without regard to the additional permissions. When you convey a copy of a covered work, you may at your option remove any additional permissions from that copy, or from any part of it. (Additional permissions may be written to require their own removal in certain cases when you modify the work.) You may place additional permissions on material, added by you to a covered work, for which you have or can give appropriate copyright permission. 
Notwithstanding any other provision of this License, for material you add to a covered work, you may (if authorized by the copyright holders of that material) supplement the terms of this License with terms: a) Disclaiming warranty or limiting liability differently from the terms of sections 15 and 16 of this License; or b) Requiring preservation of specified reasonable legal notices or author attributions in that material or in the Appropriate Legal Notices displayed by works containing it; or c) Prohibiting misrepresentation of the origin of that material, or requiring that modified versions of such material be marked in reasonable ways as different from the original version; or d) Limiting the use for publicity purposes of names of licensors or authors of the material; or e) Declining to grant rights under trademark law for use of some trade names, trademarks, or service marks; or f) Requiring indemnification of licensors and authors of that material by anyone who conveys the material (or modified versions of it) with contractual assumptions of liability to the recipient, for any liability that these contractual assumptions directly impose on those licensors and authors. All other non-permissive additional terms are considered "further restrictions" within the meaning of section 10. If the Program as you received it, or any part of it, contains a notice stating that it is governed by this License along with a term that is a further restriction, you may remove that term. If a license document contains a further restriction but permits relicensing or conveying under this License, you may add to a covered work material governed by the terms of that license document, provided that the further restriction does not survive such relicensing or conveying. 
If you add terms to a covered work in accord with this section, you must place, in the relevant source files, a statement of the additional terms that apply to those files, or a notice indicating where to find the applicable terms. Additional terms, permissive or non-permissive, may be stated in the form of a separately written license, or stated as exceptions; the above requirements apply either way. 8. Termination. You may not propagate or modify a covered work except as expressly provided under this License. Any attempt otherwise to propagate or modify it is void, and will automatically terminate your rights under this License (including any patent licenses granted under the third paragraph of section 11). However, if you cease all violation of this License, then your license from a particular copyright holder is reinstated (a) provisionally, unless and until the copyright holder explicitly and finally terminates your license, and (b) permanently, if the copyright holder fails to notify you of the violation by some reasonable means prior to 60 days after the cessation. Moreover, your license from a particular copyright holder is reinstated permanently if the copyright holder notifies you of the violation by some reasonable means, this is the first time you have received notice of violation of this License (for any work) from that copyright holder, and you cure the violation prior to 30 days after your receipt of the notice. Termination of your rights under this section does not terminate the licenses of parties who have received copies or rights from you under this License. If your rights have been terminated and not permanently reinstated, you do not qualify to receive new licenses for the same material under section 10. 9. Acceptance Not Required for Having Copies. You are not required to accept this License in order to receive or run a copy of the Program. 
Ancillary propagation of a covered work occurring solely as a consequence of using peer-to-peer transmission to receive a copy likewise does not require acceptance. However, nothing other than this License grants you permission to propagate or modify any covered work. These actions infringe copyright if you do not accept this License. Therefore, by modifying or propagating a covered work, you indicate your acceptance of this License to do so. 10. Automatic Licensing of Downstream Recipients. Each time you convey a covered work, the recipient automatically receives a license from the original licensors, to run, modify and propagate that work, subject to this License. You are not responsible for enforcing compliance by third parties with this License. An "entity transaction" is a transaction transferring control of an organization, or substantially all assets of one, or subdividing an organization, or merging organizations. If propagation of a covered work results from an entity transaction, each party to that transaction who receives a copy of the work also receives whatever licenses to the work the party's predecessor in interest had or could give under the previous paragraph, plus a right to possession of the Corresponding Source of the work from the predecessor in interest, if the predecessor has it or can get it with reasonable efforts. You may not impose any further restrictions on the exercise of the rights granted or affirmed under this License. For example, you may not impose a license fee, royalty, or other charge for exercise of rights granted under this License, and you may not initiate litigation (including a cross-claim or counterclaim in a lawsuit) alleging that any patent claim is infringed by making, using, selling, offering for sale, or importing the Program or any portion of it. 11. Patents. A "contributor" is a copyright holder who authorizes use under this License of the Program or a work on which the Program is based. 
The work thus licensed is called the contributor's "contributor version". A contributor's "essential patent claims" are all patent claims owned or controlled by the contributor, whether already acquired or hereafter acquired, that would be infringed by some manner, permitted by this License, of making, using, or selling its contributor version, but do not include claims that would be infringed only as a consequence of further modification of the contributor version. For purposes of this definition, "control" includes the right to grant patent sublicenses in a manner consistent with the requirements of this License. Each contributor grants you a non-exclusive, worldwide, royalty-free patent license under the contributor's essential patent claims, to make, use, sell, offer for sale, import and otherwise run, modify and propagate the contents of its contributor version. In the following three paragraphs, a "patent license" is any express agreement or commitment, however denominated, not to enforce a patent (such as an express permission to practice a patent or covenant not to sue for patent infringement). To "grant" such a patent license to a party means to make such an agreement or commitment not to enforce a patent against the party. If you convey a covered work, knowingly relying on a patent license, and the Corresponding Source of the work is not available for anyone to copy, free of charge and under the terms of this License, through a publicly available network server or other readily accessible means, then you must either (1) cause the Corresponding Source to be so available, or (2) arrange to deprive yourself of the benefit of the patent license for this particular work, or (3) arrange, in a manner consistent with the requirements of this License, to extend the patent license to downstream recipients. 
"Knowingly relying" means you have actual knowledge that, but for the patent license, your conveying the covered work in a country, or your recipient's use of the covered work in a country, would infringe one or more identifiable patents in that country that you have reason to believe are valid. If, pursuant to or in connection with a single transaction or arrangement, you convey, or propagate by procuring conveyance of, a covered work, and grant a patent license to some of the parties receiving the covered work authorizing them to use, propagate, modify or convey a specific copy of the covered work, then the patent license you grant is automatically extended to all recipients of the covered work and works based on it. A patent license is "discriminatory" if it does not include within the scope of its coverage, prohibits the exercise of, or is conditioned on the non-exercise of one or more of the rights that are specifically granted under this License. You may not convey a covered work if you are a party to an arrangement with a third party that is in the business of distributing software, under which you make payment to the third party based on the extent of your activity of conveying the work, and under which the third party grants, to any of the parties who would receive the covered work from you, a discriminatory patent license (a) in connection with copies of the covered work conveyed by you (or copies made from those copies), or (b) primarily for and in connection with specific products or compilations that contain the covered work, unless you entered into that arrangement, or that patent license was granted, prior to 28 March 2007. Nothing in this License shall be construed as excluding or limiting any implied license or other defenses to infringement that may otherwise be available to you under applicable patent law. 12. No Surrender of Others' Freedom. 
If conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. If you cannot convey a covered work so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not convey it at all. For example, if you agree to terms that obligate you to collect a royalty for further conveying from those to whom you convey the Program, the only way you could satisfy both those terms and this License would be to refrain entirely from conveying the Program. 13. Use with the GNU Affero General Public License. Notwithstanding any other provision of this License, you have permission to link or combine any covered work with a work licensed under version 3 of the GNU Affero General Public License into a single combined work, and to convey the resulting work. The terms of this License will continue to apply to the part which is the covered work, but the special requirements of the GNU Affero General Public License, section 13, concerning interaction through a network will apply to the combination as such. 14. Revised Versions of this License. The Free Software Foundation may publish revised and/or new versions of the GNU General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. Each version is given a distinguishing version number. If the Program specifies that a certain numbered version of the GNU General Public License "or any later version" applies to it, you have the option of following the terms and conditions either of that numbered version or of any later version published by the Free Software Foundation. If the Program does not specify a version number of the GNU General Public License, you may choose any version ever published by the Free Software Foundation. 
If the Program specifies that a proxy can decide which future versions of the GNU General Public License can be used, that proxy's public statement of acceptance of a version permanently authorizes you to choose that version for the Program. Later license versions may give you additional or different permissions. However, no additional obligations are imposed on any author or copyright holder as a result of your choosing to follow a later version. 15. Disclaimer of Warranty. THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 16. Limitation of Liability. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. 17. Interpretation of Sections 15 and 16. 
Copyright (C) <year> <name of author> This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <https://www.gnu.org/licenses/>.
For more information on this, and how to apply and follow the GNU GPL, see <https://www.gnu.org/licenses/>. The GNU General Public License does not permit incorporating your program into proprietary programs. If your program is a subroutine library, you may consider it more useful to permit linking proprietary applications with the library. If this is what you want to do, use the GNU Lesser General Public License instead of this License. But first, please read <https://www.gnu.org/licenses/why-not-lgpl.html>.
A "Combined Work" is a work produced by combining or linking an Application with the Library. The particular version of the Library with which the Combined Work was made is also called the "Linked Version". The "Minimal Corresponding Source" for a Combined Work means the Corresponding Source for the Combined Work, excluding any source code for portions of the Combined Work that, considered in isolation, are based on the Application, and not on the Linked Version. The "Corresponding Application Code" for a Combined Work means the object code and/or source code for the Application, including any data and utility programs needed for reproducing the Combined Work from the Application, but excluding the System Libraries of the Combined Work. 1. Exception to Section 3 of the GNU GPL. You may convey a covered work under sections 3 and 4 of this License without being bound by section 3 of the GNU GPL. 2. Conveying Modified Versions. If you modify a copy of the Library, and, in your modifications, a facility refers to a function or data to be supplied by an Application that uses the facility (other than as an argument passed when the facility is invoked), then you may convey a copy of the modified version: a) under this License, provided that you make a good faith effort to ensure that, in the event an Application does not supply the function or data, the facility still operates, and performs whatever part of its purpose remains meaningful, or b) under the GNU GPL, with none of the additional permissions of this License applicable to that copy. 3. Object Code Incorporating Material from Library Header Files. The object code form of an Application may incorporate material from a header file that is part of the Library. 
You may convey such object code under terms of your choice, provided that, if the incorporated material is not limited to numerical parameters, data structure layouts and accessors, or small macros, inline functions and templates (ten or fewer lines in length), you do both of the following: a) Give prominent notice with each copy of the object code that the Library is used in it and that the Library and its use are covered by this License. b) Accompany the object code with a copy of the GNU GPL and this license document. 4. Combined Works. You may convey a Combined Work under terms of your choice that, taken together, effectively do not restrict modification of the portions of the Library contained in the Combined Work and reverse engineering for debugging such modifications, if you also do each of the following: a) Give prominent notice with each copy of the Combined Work that the Library is used in it and that the Library and its use are covered by this License. b) Accompany the Combined Work with a copy of the GNU GPL and this license document. c) For a Combined Work that displays copyright notices during execution, include the copyright notice for the Library among these notices, as well as a reference directing the user to the copies of the GNU GPL and this license document. d) Do one of the following: 0) Convey the Minimal Corresponding Source under the terms of this License, and the Corresponding Application Code in a form suitable for, and under terms that permit, the user to recombine or relink the Application with a modified version of the Linked Version to produce a modified Combined Work, in the manner specified by section 6 of the GNU GPL for conveying Corresponding Source. 1) Use a suitable shared library mechanism for linking with the Library. 
A suitable mechanism is one that (a) uses at run time a copy of the Library already present on the user's computer system, and (b) will operate properly with a modified version of the Library that is interface-compatible with the Linked Version. e) Provide Installation Information, but only if you would otherwise be required to provide such information under section 6 of the GNU GPL, and only to the extent that such information is necessary to install and execute a modified version of the Combined Work produced by recombining or relinking the Application with a modified version of the Linked Version. (If you use option 4d0, the Installation Information must accompany the Minimal Corresponding Source and Corresponding Application Code. If you use option 4d1, you must provide the Installation Information in the manner specified by section 6 of the GNU GPL for conveying Corresponding Source.) 5. Combined Libraries. You may place library facilities that are a work based on the Library side by side in a single library together with other library facilities that are not Applications and are not covered by this License, and convey such a combined library under terms of your choice, if you do both of the following: a) Accompany the combined library with a copy of the same work based on the Library, uncombined with any other library facilities, conveyed under the terms of this License. b) Give prominent notice with the combined library that part of it is a work based on the Library, and explaining where to find the accompanying uncombined form of the same work. 6. Revised Versions of the GNU Lesser General Public License. The Free Software Foundation may publish revised and/or new versions of the GNU Lesser General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. Each version is given a distinguishing version number. 
If the Library as you received it specifies that a certain numbered version of the GNU Lesser General Public License "or any later version" applies to it, you have the option of following the terms and conditions either of that published version or of any later version published by the Free Software Foundation. If the Library as you received it does not specify a version number of the GNU Lesser General Public License, you may choose any version of the GNU Lesser General Public License ever published by the Free Software Foundation. If the Library as you received it specifies that a proxy can decide whether future versions of the GNU Lesser General Public License shall apply, that proxy's public statement of acceptance of any version is permanent authorization for you to choose that version for the Library. ================================================ FILE: external/bliss/bliss/abgraph.hh ================================================ #ifndef BLISS_AB_GRAPH_HH #define BLISS_AB_GRAPH_HH #include #include #include #include #include #include #include namespace bliss { class AbstractGraph; } #include "kstack.hh" #include "kqueue.hh" #include "heap.hh" #include "orbit.hh" #include "partition.hh" #include "bignum.hh" #include "uintseqhash.hh" namespace bliss { void fatal_error(const char* fmt, ...) 
{
  // Body of fatal_error(const char* fmt, ...), declared just above: print a
  // printf-style message to stderr, prefixed with "Bliss fatal error: ",
  // then terminate the whole process.  This function never returns.
  va_list ap;
  va_start(ap, fmt);
  fprintf(stderr,"Bliss fatal error: ");
  vfprintf(stderr, fmt, ap);
  fprintf(stderr, "\nAborting!\n");
  va_end(ap);
  exit(1);
}

// Report an unrecoverable condition together with the source location at
// which the macro was expanded; both expand to a call to fatal_error(),
// which aborts the process.
#define _INTERNAL_ERROR() fatal_error("%s:%d: internal error",__FILE__,__LINE__)
#define _OUT_OF_MEMORY() fatal_error("%s:%d: out of memory",__FILE__,__LINE__)

// NOTE(review): the template argument list of std::pair appears to have been
// stripped from this text (extraction artifact); presumably this was
// std::pair<unsigned int, unsigned int> -- TODO confirm against the upstream
// bliss sources before building.
typedef std::pair Index;

/**
 * \internal
 * A single node of the bliss search tree.  Records how the parent partition
 * was split to reach this node, plus per-node bookkeeping for the
 * first-path / best-path heuristics, failure recording, component recursion
 * and the "long prune" optimization used during the search.
 */
class TreeNode
{
  //friend class AbstractGraph;
public:
  // First element of the cell that was split at this node, and the element
  // the split was performed on.
  unsigned int split_cell_first;
  int split_element;
  // Sentinel values stored in split_element to mark the start/end of the
  // splitting process at this node.
  static const int SPLIT_START = -1;
  static const int SPLIT_END = -2;
  // Saved state for undoing the partition refinement when backtracking
  // out of this node.
  Partition::BacktrackPoint partition_bt_point;
  unsigned int certificate_index;
  // Three-valued logic constants (used e.g. for fp_extendable below).
  static const char NO = -1;
  static const char MAYBE = 0;
  static const char YES = 1;
  /* First path stuff */
  bool fp_on;
  bool fp_cert_equal;
  char fp_extendable;   // one of NO / MAYBE / YES above
  /* Best path stuff */
  bool in_best_path;
  int cmp_to_best_path;
  // Failure recording state (see set_failure_recording in AbstractGraph).
  unsigned int failure_recording_ival;
  /* Component recursion related data */
  unsigned int cr_cep_stack_size;
  unsigned int cr_cep_index;
  unsigned int cr_level;
  // "Long prune" optimization state for this node.
  bool needs_long_prune;
  unsigned int long_prune_begin;
  // NOTE(review): the inner template argument of std::set appears to have
  // been stripped (extraction artifact) -- verify the element type against
  // the upstream bliss sources.
  std::set > long_prune_redundant;
  // Hash accumulated over this node's refinement; name suggests it is used
  // to compare equitable refinements between search-tree paths.
  UintSeqHash eqref_hash;
  unsigned int subcertificate_length;
};

/**
 * \internal
 * Per-level summary of a path in the search tree: the element the level was
 * split on and the certificate bookkeeping recorded at that level.
 */
typedef struct
{
  unsigned int splitting_element;
  unsigned int certificate_index;
  unsigned int subcertificate_length;
  UintSeqHash eqref_hash;
} PathInfo;

// \brief Statistics returned by the bliss search algorithm.
class Stats
{
  friend class AbstractGraph;
  /** \internal The size of the automorphism group. */
  BigNum group_size;
  /** \internal An approximation (due to possible overflows) of
   * the size of the automorphism group. */
  long double group_size_approx;
  /** \internal The number of nodes in the search tree. */
  long unsigned int nof_nodes;
  /** \internal The number of leaf nodes in the search tree. */
  long unsigned int nof_leaf_nodes;
  /** \internal The number of bad nodes in the search tree. */
  long unsigned int nof_bad_nodes;
  /** \internal The number of canonical representative updates. */
  long unsigned int nof_canupdates;
  /** \internal The number of generator permutations. */
  long unsigned int nof_generators;
  /** \internal The maximal depth of the search tree. */
  unsigned long int max_level;
  /** \internal Reset every statistic to its initial value
   * (trivial group, all counters zero). */
  void reset()
  {
    group_size.assign(1);
    group_size_approx = 1.0;
    nof_nodes = 0;
    nof_leaf_nodes = 0;
    nof_bad_nodes = 0;
    nof_canupdates = 0;
    nof_generators = 0;
    max_level = 0;
  }
public:
  Stats() { reset(); }
  /** Print the statistics to \a fp.
   * Returns the total number of characters written (the sum of the
   * fprintf results and group_size.print). */
  size_t print(FILE* const fp) const
  {
    size_t r = 0;
    r += fprintf(fp, "Nodes: %lu\n", nof_nodes);
    r += fprintf(fp, "Leaf nodes: %lu\n", nof_leaf_nodes);
    r += fprintf(fp, "Bad nodes: %lu\n", nof_bad_nodes);
    r += fprintf(fp, "Canrep updates: %lu\n", nof_canupdates);
    r += fprintf(fp, "Generators: %lu\n", nof_generators);
    r += fprintf(fp, "Max level: %lu\n", max_level);
    r += fprintf(fp, "|Aut|: ")+group_size.print(fp)+fprintf(fp, "\n");
    fflush(fp);
    return r;
  }
  /** An approximation (due to possible overflows/rounding errors) of
   * the size of the automorphism group. */
  long double get_group_size_approx() const {return group_size_approx;}
  /** The number of nodes in the search tree. */
  long unsigned int get_nof_nodes() const {return nof_nodes;}
  /** The number of leaf nodes in the search tree. */
  long unsigned int get_nof_leaf_nodes() const {return nof_leaf_nodes;}
  /** The number of bad nodes in the search tree. */
  long unsigned int get_nof_bad_nodes() const {return nof_bad_nodes;}
  /** The number of canonical representative updates. */
  long unsigned int get_nof_canupdates() const {return nof_canupdates;}
  /** The number of generator permutations. */
  long unsigned int get_nof_generators() const {return nof_generators;}
  /** The maximal depth of the search tree. */
  unsigned long int get_max_level() const {return max_level;}
};

// \brief An abstract base class for different types of graphs.
class AbstractGraph { friend class Partition; public: //AbstractGraph(); // Constructor and destructor routines for the abstract graph class AbstractGraph() { // Initialize stuff first_path_labeling = 0; first_path_labeling_inv = 0; best_path_labeling = 0; best_path_labeling_inv = 0; first_path_automorphism = 0; best_path_automorphism = 0; in_search = false; // Default value for using "long prune" opt_use_long_prune = true; // Default value for using failure recording opt_use_failure_recording = true; // Default value for using component recursion opt_use_comprec = true; verbose_level = 0; verbstr = stdout; report_hook = 0; report_user_param = 0; } //virtual ~AbstractGraph(); virtual ~AbstractGraph() { if(first_path_labeling) { free(first_path_labeling); first_path_labeling = 0; } if(first_path_labeling_inv) { free(first_path_labeling_inv); first_path_labeling_inv = 0; } if(best_path_labeling) { free(best_path_labeling); best_path_labeling = 0; } if(best_path_labeling_inv) { free(best_path_labeling_inv); best_path_labeling_inv = 0; } if(first_path_automorphism) { free(first_path_automorphism); first_path_automorphism = 0; } if(best_path_automorphism) { free(best_path_automorphism); best_path_automorphism = 0; } report_hook = 0; report_user_param = 0; } //Set the verbose output level for the algorithms. // \param level the level of verbose output, 0 means no verbose output //void set_verbose_level(const unsigned int level); void set_verbose_level(const unsigned int level) { verbose_level = level; } /** * Set the file stream for the verbose output. * \param fp the file stream; if null, no verbose output is written */ //void set_verbose_file(FILE * const fp); void set_verbose_file(FILE* const fp) { verbstr = fp; } /** * Add a new vertex with color \a color in the graph and return its index. */ virtual unsigned int add_vertex(const unsigned int color = 0) = 0; /** * Add an edge between vertices \a source and \a target. 
* Duplicate edges between vertices are ignored but try to avoid introducing * them in the first place as they are not ignored immediately but will * consume memory and computation resources for a while. */ virtual void add_edge(const unsigned int source, const unsigned int target, Index index) = 0; /** * Change the color of the vertex \a vertex to \a color. */ virtual void change_color(const unsigned int vertex, const unsigned int color) = 0; /** * Check whether \a perm is an automorphism of this graph. * Unoptimized, mainly for debugging purposes. */ //virtual bool is_automorphism(const std::vector& perm) const; virtual bool is_automorphism(const std::vector& perm) const { _INTERNAL_ERROR(); return false; } /** Activate/deactivate failure recording. * May not be called during the search, i.e. from an automorphism reporting * hook function. * \param active if true, activate failure recording, deactivate otherwise */ void set_failure_recording(const bool active) {assert(!in_search); opt_use_failure_recording = active;} /** Activate/deactivate component recursion. * The choice affects the computed canonical labelings; * therefore, if you want to compare whether two graphs are isomorphic by * computing and comparing (for equality) their canonical versions, * be sure to use the same choice for both graphs. * May not be called during the search, i.e. from an automorphism reporting * hook function. * \param active if true, activate component recursion, deactivate otherwise */ void set_component_recursion(const bool active) {assert(!in_search); opt_use_comprec = active;} /** * Return the number of vertices in the graph. */ virtual unsigned int get_nof_vertices() const = 0; /** * Return a new graph that is the result of applying the permutation \a perm * to this graph. This graph is not modified. * \a perm must contain N=this.get_nof_vertices() elements and be a bijection * on {0,1,...,N-1}, otherwise the result is undefined or a segfault. 
*/ virtual AbstractGraph* permute(const unsigned* const perm) const = 0; virtual AbstractGraph* permute(const std::vector& perm) const = 0; /** * Find a set of generators for the automorphism group of the graph. * The function \a hook (if non-null) is called each time a new generator * for the automorphism group is found. * The first argument \a user_param for the hook is the * \a hook_user_param given below, * the second argument \a n is the length of the automorphism (equal to * get_nof_vertices()) and * the third argument \a aut is the automorphism * (a bijection on {0,...,get_nof_vertices()-1}). * The memory for the automorphism \a aut will be invalidated immediately * after the return from the hook function; * if you want to use the automorphism later, you have to take a copy of it. * Do not call any member functions in the hook. * The search statistics are copied in \a stats. */ //void find_automorphisms(Stats& stats, void (*hook)(void* user_param, unsigned int n, const unsigned int* aut), void* hook_user_param); void find_automorphisms(Stats& stats, void (*hook)(void *user_param, unsigned int n, const unsigned int *aut), void *user_param) { report_hook = hook; report_user_param = user_param; search(false, stats); if(first_path_labeling) { free(first_path_labeling); first_path_labeling = 0; } if(best_path_labeling) { free(best_path_labeling); best_path_labeling = 0; } } /** * Otherwise the same as find_automorphisms() except that * a canonical labeling of the graph (a bijection on * {0,...,get_nof_vertices()-1}) is returned. * The memory allocated for the returned canonical labeling will remain * valid only until the next call to a member function with the exception * that constant member functions (for example, bliss::Graph::permute()) can * be called without invalidating the labeling. * To compute the canonical version of an undirected graph, call this * function and then bliss::Graph::permute() with the returned canonical * labeling. 
* Note that the computed canonical version may depend on the applied version * of bliss as well as on some other options (for instance, the splitting * heuristic selected with bliss::Graph::set_splitting_heuristic()). */ //const unsigned int* canonical_form(Stats& stats, void (*hook)(void* user_param, unsigned int n, const unsigned int* aut), void* hook_user_param); const unsigned * canonical_form(Stats& stats, void (*hook)(void *user_param, unsigned int n, const unsigned int *aut), void *user_param) { report_hook = hook; report_user_param = user_param; search(true, stats); return best_path_labeling; } /** * Write the graph to a file in a variant of the DIMACS format. * See the bliss website * for the definition of the file format. * Note that in the DIMACS file the vertices are numbered from 1 to N while * in this C++ API they are from 0 to N-1. * Thus the vertex n in the file corresponds to the vertex n-1 in the API. * \param fp the file stream where the graph is written */ virtual void write_dimacs(FILE * const fp) = 0; /** * Write the graph to a file in the graphviz dotty format. * \param fp the file stream where the graph is written */ virtual void write_dot(FILE * const fp) = 0; /** * Write the graph in a file in the graphviz dotty format. * Do nothing if the file cannot be written. * \param file_name the name of the file to which the graph is written */ virtual void write_dot(const char * const file_name) = 0; /** * Get a hash value for the graph. * \return the hash value */ virtual unsigned int get_hash() = 0; /** * Disable/enable the "long prune" method. * The choice affects the computed canonical labelings; * therefore, if you want to compare whether two graphs are isomorphic by * computing and comparing (for equality) their canonical versions, * be sure to use the same choice for both graphs. * May not be called during the search, i.e. from an automorphism reporting * hook function. 
* \param active if true, activate "long prune", deactivate otherwise */ void set_long_prune_activity(const bool active) { assert(!in_search); opt_use_long_prune = active; } protected: /** \internal * How much verbose output is produced (0 means none) */ unsigned int verbose_level; /** \internal * The output stream for verbose output. */ FILE *verbstr; protected: /** \internal * The ordered partition used in the search algorithm. */ Partition p; /** \internal * Whether the search for automorphisms and a canonical labeling is * in progress. */ bool in_search; /** \internal * Is failure recording in use? */ bool opt_use_failure_recording; /* The "tree-specific" invariant value for the point when current path * got different from the first path */ unsigned int failure_recording_fp_deviation; /** \internal * Is component recursion in use? */ bool opt_use_comprec; unsigned int refine_current_path_certificate_index; bool refine_compare_certificate; bool refine_equal_to_first; unsigned int refine_first_path_subcertificate_end; int refine_cmp_to_best; unsigned int refine_best_path_subcertificate_end; static const unsigned int CERT_SPLIT = 0; //UINT_MAX; static const unsigned int CERT_EDGE = 1; //UINT_MAX-1; /** \internal * Add a triple (v1,v2,v3) in the certificate. * May modify refine_equal_to_first and refine_cmp_to_best. * May also update eqref_hash and failure_recording_fp_deviation. */ //void cert_add(const unsigned int v1, const unsigned int v2, const unsigned int v3); // Certificate building void cert_add(const unsigned int v1, const unsigned int v2, const unsigned int v3) { if(refine_compare_certificate) { if(refine_equal_to_first) { /* So far equivalent to the first path... 
*/ unsigned int index = certificate_current_path.size(); if(index >= refine_first_path_subcertificate_end) { refine_equal_to_first = false; } else if(certificate_first_path[index] != v1) { refine_equal_to_first = false; } else if(certificate_first_path[++index] != v2) { refine_equal_to_first = false; } else if(certificate_first_path[++index] != v3) { refine_equal_to_first = false; } if(opt_use_failure_recording and !refine_equal_to_first) { /* We just became different from the first path, * remember the deviation point tree-specific invariant * for the use of failure recording */ UintSeqHash h; h.update(v1); h.update(v2); h.update(v3); h.update(index); h.update(eqref_hash.get_value()); failure_recording_fp_deviation = h.get_value(); } } if(refine_cmp_to_best == 0) { /* So far equivalent to the current best path... */ unsigned int index = certificate_current_path.size(); if(index >= refine_best_path_subcertificate_end) { refine_cmp_to_best = 1; } else if(v1 > certificate_best_path[index]) { refine_cmp_to_best = 1; } else if(v1 < certificate_best_path[index]) { refine_cmp_to_best = -1; } else if(v2 > certificate_best_path[++index]) { refine_cmp_to_best = 1; } else if(v2 < certificate_best_path[index]) { refine_cmp_to_best = -1; } else if(v3 > certificate_best_path[++index]) { refine_cmp_to_best = 1; } else if(v3 < certificate_best_path[index]) { refine_cmp_to_best = -1; } } if((refine_equal_to_first == false) and (refine_cmp_to_best < 0)) return; } /* Update the current path certificate */ certificate_current_path.push_back(v1); certificate_current_path.push_back(v2); certificate_current_path.push_back(v3); } /** \internal * Add a redundant triple (v1,v2,v3) in the certificate. * Can also just dicard the triple. * May modify refine_equal_to_first and refine_cmp_to_best. * May also update eqref_hash and failure_recording_fp_deviation. 
*/ //void cert_add_redundant(const unsigned int x, const unsigned int y, const unsigned int z); void cert_add_redundant(const unsigned int v1, const unsigned int v2, const unsigned int v3) { return cert_add(v1, v2, v3); } /**\internal * Is the long prune method in use? */ bool opt_use_long_prune; /**\internal * Maximum amount of memory (in megabytes) available for * the long prune method */ static const unsigned int long_prune_options_max_mem = 50; /**\internal * Maximum amount of automorphisms stored for the long prune method; * less than this is stored if the memory limit above is reached first */ static const unsigned int long_prune_options_max_stored_auts = 100; unsigned int long_prune_max_stored_autss; std::vector *> long_prune_fixed; std::vector *> long_prune_mcrs; std::vector long_prune_temp; unsigned int long_prune_begin; unsigned int long_prune_end; /** \internal * Initialize the "long prune" data structures. */ //void long_prune_init(); /** \internal * Release the memory allocated for "long prune" data structures. */ //void long_prune_deallocate(); //void long_prune_add_automorphism(const unsigned int *aut); //std::vector& long_prune_get_fixed(const unsigned int index); //std::vector& long_prune_allocget_fixed(const unsigned int index); //std::vector& long_prune_get_mcrs(const unsigned int index); //std::vector& long_prune_allocget_mcrs(const unsigned int index); /** \internal * Swap the i:th and j:th stored automorphism information; * i and j must be "in window, i.e. in [long_prune_begin,long_prune_end[ */ //void long_prune_swap(const unsigned int i, const unsigned int j); //Long prune code void long_prune_init() { const unsigned int N = get_nof_vertices(); long_prune_temp.clear(); long_prune_temp.resize(N); /* Of how many automorphisms we can store information in the predefined, fixed amount of memory? 
*/ const unsigned int nof_fitting_in_max_mem = (long_prune_options_max_mem * 1024 * 1024) / (((N * 2) / 8)+1); long_prune_max_stored_autss = long_prune_options_max_stored_auts; /* Had some problems with g++ in using (a* tmp = long_prune_fixed[real_i]; long_prune_fixed[real_i] = long_prune_fixed[real_j]; long_prune_fixed[real_j] = tmp; tmp = long_prune_mcrs[real_i]; long_prune_mcrs[real_i] = long_prune_mcrs[real_j]; long_prune_mcrs[real_j] = tmp; } std::vector& long_prune_allocget_fixed(const unsigned int index) { const unsigned int i = index % long_prune_max_stored_autss; if(!long_prune_fixed[i]) long_prune_fixed[i] = new std::vector(get_nof_vertices()); return *long_prune_fixed[i]; } std::vector& long_prune_get_fixed(const unsigned int index) { return *long_prune_fixed[index % long_prune_max_stored_autss]; } std::vector& long_prune_allocget_mcrs(const unsigned int index) { const unsigned int i = index % long_prune_max_stored_autss; if(!long_prune_mcrs[i]) long_prune_mcrs[i] = new std::vector(get_nof_vertices()); return *long_prune_mcrs[i]; } std::vector& long_prune_get_mcrs(const unsigned int index) { return *long_prune_mcrs[index % long_prune_max_stored_autss]; } void long_prune_add_automorphism(const unsigned int* aut) { if(long_prune_max_stored_autss == 0) return; const unsigned int N = get_nof_vertices(); /* If the buffer of stored auts is full, remove the oldest aut */ if(long_prune_end - long_prune_begin == long_prune_max_stored_autss) { long_prune_begin++; } long_prune_end++; std::vector& fixed = long_prune_allocget_fixed(long_prune_end-1); std::vector& mcrs = long_prune_allocget_mcrs(long_prune_end-1); /* Mark nodes that are (i) fixed or (ii) minimal orbit representatives * under the automorphism 'aut' */ for(unsigned int i = 0; i < N; i++) { fixed[i] = (aut[i] == i); if(long_prune_temp[i] == false) { mcrs[i] = true; unsigned int j = aut[i]; while(j != i) { long_prune_temp[j] = true; j = aut[j]; } } else { mcrs[i] = false; } /* Clear the temp array 
on-the-fly... */ long_prune_temp[i] = false; } } /* * Data structures and routines for refining the partition p into equitable */ Heap neighbour_heap; virtual bool split_neighbourhood_of_unit_cell(Partition::Cell *) = 0; virtual bool split_neighbourhood_of_cell(Partition::Cell * const) = 0; //void refine_to_equitable(); //void refine_to_equitable(Partition::Cell * const unit_cell); //void refine_to_equitable(Partition::Cell * const unit_cell1, Partition::Cell * const unit_cell2); void refine_to_equitable() { /* Start refinement from all cells -> push 'em all in the splitting queue */ for(Partition::Cell* cell = p.first_cell; cell; cell = cell->next) p.splitting_queue_add(cell); do_refine_to_equitable(); } void refine_to_equitable(Partition::Cell* const unit_cell) { p.splitting_queue_add(unit_cell); do_refine_to_equitable(); } void refine_to_equitable(Partition::Cell* const unit_cell1, Partition::Cell* const unit_cell2) { p.splitting_queue_add(unit_cell1); p.splitting_queue_add(unit_cell2); do_refine_to_equitable(); } /** \internal * \return false if it was detected that the current certificate * is different from the first and/or best (whether this is checked * depends on in_search and refine_compare_certificate flags. 
*/ //bool do_refine_to_equitable(); bool do_refine_to_equitable() { eqref_hash.reset(); while(!p.splitting_queue_is_empty()) { Partition::Cell* const cell = p.splitting_queue_pop(); if(cell->is_unit()) { if(in_search) { const unsigned int index = cell->first; if(first_path_automorphism) { /* Build the (potential) automorphism on-the-fly */ first_path_automorphism[first_path_labeling_inv[index]] = p.elements[index]; } if(best_path_automorphism) { /* Build the (potential) automorphism on-the-fly */ best_path_automorphism[best_path_labeling_inv[index]] = p.elements[index]; } } const bool worse = split_neighbourhood_of_unit_cell(cell); if(in_search and worse) goto worse_exit; } else { const bool worse = split_neighbourhood_of_cell(cell); if(in_search and worse) goto worse_exit; } } return true; worse_exit: /* Clear splitting_queue */ p.splitting_queue_clear(); return false; } unsigned int eqref_max_certificate_index; /** \internal * Whether eqref_hash is updated during equitable refinement process. */ bool compute_eqref_hash; UintSeqHash eqref_hash; /** \internal * Check whether the current partition p is equitable. * Performance: very slow, use only for debugging purposes. */ virtual bool is_equitable() const = 0; unsigned int *first_path_labeling; unsigned int *first_path_labeling_inv; Orbit first_path_orbits; unsigned int *first_path_automorphism; unsigned int *best_path_labeling; unsigned int *best_path_labeling_inv; Orbit best_path_orbits; unsigned int *best_path_automorphism; //void update_labeling(unsigned int * const lab); /** \internal * Assign the labeling induced by the current partition 'this.p' to * \a labeling. * That is, if the partition is [[2,0],[1]], * then \a labeling will map 0 to 1, 1 to 2, and 2 to 0. 
*/ void update_labeling(unsigned int* const labeling) { const unsigned int N = get_nof_vertices(); unsigned int* ep = p.elements; for(unsigned int i = 0; i < N; i++, ep++) labeling[*ep] = i; } //void update_labeling_and_its_inverse(unsigned int * const lab, unsigned int * const lab_inv); /** \internal * The same as update_labeling() except that the inverse of the labeling * is also produced and assigned to \a labeling_inv. */ void update_labeling_and_its_inverse(unsigned int* const labeling, unsigned int* const labeling_inv) { const unsigned int N = get_nof_vertices(); unsigned int* ep = p.elements; unsigned int* clip = labeling_inv; for(unsigned int i = 0; i < N; ) { labeling[*ep] = i; i++; *clip = *ep; ep++; clip++; } } void update_orbit_information(Orbit &o, const unsigned int *perm) { const unsigned int N = get_nof_vertices(); for(unsigned int i = 0; i < N; i++) if(perm[i] != i) o.merge_orbits(i, perm[i]); } //void reset_permutation(unsigned int *perm); /* Mainly for debugging purposes */ //virtual bool is_automorphism(unsigned int* const perm); // \internal // Reset the permutation \a perm to the identity permutation. 
void reset_permutation(unsigned int* perm) { const unsigned int N = get_nof_vertices(); for(unsigned int i = 0; i < N; i++, perm++) *perm = i; } virtual bool is_automorphism(unsigned int* const perm) { _INTERNAL_ERROR(); return false; } std::vector certificate_current_path; std::vector certificate_first_path; std::vector certificate_best_path; unsigned int certificate_index; virtual void initialize_certificate() = 0; virtual void remove_duplicate_edges() = 0; virtual void make_initial_equitable_partition() = 0; virtual Partition::Cell* find_next_cell_to_be_splitted(Partition::Cell *cell) = 0; //void search(const bool canonical, Stats &stats); #include "search.h" void (*report_hook)(void *user_param, unsigned int n, const unsigned int *aut); void *report_user_param; /* * * Nonuniform component recursion (NUCR) * */ /** The currently traversed component */ unsigned int cr_level; /** \internal * The "Component End Point" data structure */ class CR_CEP { public: /** At which level in the search was this CEP created */ unsigned int creation_level; /** The current component has been fully traversed when the partition has * this many discrete cells left */ unsigned int discrete_cell_limit; /** The component to be traversed after the current one */ unsigned int next_cr_level; /** The next component end point */ unsigned int next_cep_index; bool first_checked; bool best_checked; }; /** \internal * A stack for storing Component End Points */ std::vector cr_cep_stack; /** \internal * Find the first non-uniformity component at the component recursion * level \a level. * The component is stored in \a cr_component. * If no component is found, \a cr_component is empty. * Returns false if all the cells in the component recursion level \a level * were discrete. * Modifies the max_ival and max_ival_count fields of Partition:Cell * (assumes that they are 0 when called and * quarantees that they are 0 when returned). 
*/ virtual bool nucr_find_first_component(const unsigned int level) = 0; virtual bool nucr_find_first_component(const unsigned int level, std::vector& component, unsigned int& component_elements, Partition::Cell*& sh_return) = 0; /** \internal * The non-uniformity component found by nucr_find_first_component() * is stored here. */ std::vector cr_component; /** \internal * The number of vertices in the component \a cr_component */ unsigned int cr_component_elements; }; // Assumes that the elements in the cell are sorted according to their invariant values. Partition::Cell* Partition::split_cell(Partition::Cell* const original_cell) { Partition::Cell* cell = original_cell; const bool original_cell_was_in_splitting_queue = original_cell->in_splitting_queue; Partition::Cell* largest_new_cell = 0; while(true) { unsigned int* ep = elements + cell->first; const unsigned int* const lp = ep + cell->length; const unsigned int ival = invariant_values[*ep]; invariant_values[*ep] = 0; element_to_cell_map[*ep] = cell; in_pos[*ep] = ep; ep++; while(ep < lp) { const unsigned int e = *ep; if(invariant_values[e] != ival) break; invariant_values[e] = 0; in_pos[e] = ep; ep++; element_to_cell_map[e] = cell; } if(ep == lp) break; Partition::Cell* const new_cell = aux_split_in_two(cell, (ep - elements) - cell->first); if(graph and graph->compute_eqref_hash) { graph->eqref_hash.update(new_cell->first); graph->eqref_hash.update(new_cell->length); graph->eqref_hash.update(ival); } /* Add cells in splitting_queue */ assert(!new_cell->is_in_splitting_queue()); if(original_cell_was_in_splitting_queue) { /* In this case, all new cells are inserted in splitting_queue */ assert(cell->is_in_splitting_queue()); splitting_queue_add(new_cell); } else { /* Otherwise, we can omit one new cell from splitting_queue */ assert(!cell->is_in_splitting_queue()); if(largest_new_cell == 0) { largest_new_cell = cell; } else { assert(!largest_new_cell->is_in_splitting_queue()); if(cell->length > 
largest_new_cell->length) { splitting_queue_add(largest_new_cell); largest_new_cell = cell; } else { splitting_queue_add(cell); } } } /* Process the rest of the cell */ cell = new_cell; } if(original_cell == cell) { /* All the elements in cell had the same invariant value */ return cell; } /* Add cells in splitting_queue */ if(!original_cell_was_in_splitting_queue) { /* Also consider the last new cell */ assert(largest_new_cell); if(cell->length > largest_new_cell->length) { splitting_queue_add(largest_new_cell); largest_new_cell = cell; } else { splitting_queue_add(cell); } if(largest_new_cell->is_unit()) { /* Needed in certificate computation */ splitting_queue_add(largest_new_cell); } } return cell; } } #endif ================================================ FILE: external/bliss/bliss/bignum.hh ================================================ #ifndef BLISS_BIGNUM_HH #define BLISS_BIGNUM_HH /* Copyright (c) 2003-2015 Tommi Junttila Released under the GNU Lesser General Public License version 3. This file is part of bliss. bliss is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, version 3 of the License. bliss is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with bliss. If not, see . */ #if defined(BLISS_USE_GMP) #include #endif #include #include #include "defs.hh" namespace bliss { /** * \brief A very simple class for big integers (or approximation of them). * * If the compile time flag BLISS_USE_GMP is set, * then the GNU Multiple Precision Arithmetic library (GMP) is used to * obtain arbitrary precision, otherwise "long double" is used to * approximate big integers. 
*/ #if defined(BLISS_USE_GMP) class BigNum { mpz_t v; public: /** * Create a new big number and set it to zero. */ BigNum() {mpz_init(v); } /** * Destroy the number. */ ~BigNum() {mpz_clear(v); } /** * Set the number to \a n. */ void assign(const int n) {mpz_set_si(v, n); } /** * Multiply the number with \a n. */ void multiply(const int n) {mpz_mul_si(v, v, n); } /** * Print the number in the file stream \a fp. */ size_t print(FILE* const fp) const {return mpz_out_str(fp, 10, v); } }; #else class BigNum { long double v; public: /** * Create a new big number and set it to zero. */ BigNum(): v(0.0) {} /** * Set the number to \a n. */ void assign(const int n) {v = (long double)n; } /** * Multiply the number with \a n. */ void multiply(const int n) {v *= (long double)n; } /** * Print the number in the file stream \a fp. */ size_t print(FILE* const fp) const {return fprintf(fp, "%Lg", v); } }; #endif } //namespace bliss #endif ================================================ FILE: external/bliss/bliss/defs.hh ================================================ #ifndef BLISS_DEFS_HH #define BLISS_DEFS_HH #include #include /* Copyright (c) 2003-2015 Tommi Junttila Released under the GNU Lesser General Public License version 3. This file is part of bliss. bliss is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, version 3 of the License. bliss is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with bliss. If not, see . */ namespace bliss { /** * The version number of bliss. */ static const char * const version = "0.73"; /* * If a fatal error (out of memory, internal error) is encountered, * this function is called. 
* There should not be a return from this function but exit or * a jump to code that deallocates the AbstractGraph instance that called this. */ void fatal_error(const char* fmt, ...); #if defined(BLISS_DEBUG) #define BLISS_CONSISTENCY_CHECKS #define BLISS_EXPENSIVE_CONSISTENCY_CHECKS #endif #if defined(BLISS_CONSISTENCY_CHECKS) /* Force a check that the found automorphisms are valid */ #define BLISS_VERIFY_AUTOMORPHISMS #endif #if defined(BLISS_CONSISTENCY_CHECKS) /* Force a check that the generated partitions are equitable */ #define BLISS_VERIFY_EQUITABLEDNESS #endif } // namespace bliss /*! \mainpage Bliss * * \section intro_sec Introduction * * This is the source code documentation of bliss, * produced by running doxygen in * the source directory. * The algorithms and data structures used in bliss are documented in * the papers found at the * bliss web site. * * * \section compile_sec Compiling * * Compiling bliss in Linux should be easy, just execute * \code * make * \endcode * in the bliss source directory. * This will produce the executable program \c bliss as well as * the library file \c libbliss.a that can be linked in other programs. * If you have the GNU Multiple Precision * Arithmetic Library (GMP) installed in your machine, you can also use * \code * make gmp * \endcode * to enable exact computation of automorphism group sizes. * * When linking the bliss library \c libbliss.a in other programs, * remember to include the standard c++ library * (and the GMP library if you compiled bliss to include it). * For instance, * \code gcc -o test test.c -lstdc++ -lgmp -lbliss\endcode * * \section cppapi_sec The C++ language API * * The C++ language API is the main API to bliss; * all other APIs are just more or less complete variants of it. * The C++ API consists basically of the public methods in * the classes bliss::AbstractGraph, bliss::Graph, and bliss::Digraph. * For an example of its use, * see the \ref executable "source of the bliss executable". 
* * * \section capi_sec The C language API * * The C language API is given in the file bliss_C.h. * It is currently more restricted than the C++ API so * consider using the C++ API whenever possible. */ #endif ================================================ FILE: external/bliss/bliss/graph.hh ================================================ #ifndef BLISS_GRAPH_HH #define BLISS_GRAPH_HH #include "abgraph.hh" namespace bliss { #ifdef USE_DOMAIN typedef std::pair IndexEdge; #else typedef unsigned IndexEdge; #endif #if defined(BLISS_CONSISTENCY_CHECKS) static bool is_permutation(const unsigned int N, const unsigned int* perm) { if(N == 0) return true; std::vector m(N, false); for(unsigned int i = 0; i < N; i++) { if(perm[i] >= N) return false; if(m[perm[i]]) return false; m[perm[i]] = true; } return true; } #endif static bool is_permutation(const std::vector& perm) { const unsigned int N = perm.size(); if(N == 0) return true; std::vector m(N, false); for(unsigned int i = 0; i < N; i++) { if(perm[i] >= N) return false; if(m[perm[i]]) return false; m[perm[i]] = true; } return true; } // \brief The class for undirected, vertex colored graphs. // Multiple edges between vertices are not allowed (i.e., are ignored). class Graph : public AbstractGraph { public: /** * The possible splitting heuristics. * The selected splitting heuristics affects the computed canonical * labelings; therefore, if you want to compare whether two graphs * are isomorphic by computing and comparing (for equality) their * canonical versions, be sure to use the same splitting heuristics * for both graphs. */ typedef enum { /** First non-unit cell. * Very fast but may result in large search spaces on difficult graphs. * Use for large but easy graphs. */ shs_f = 0, /** First smallest non-unit cell. * Fast, should usually produce smaller search spaces than shs_f. */ shs_fs, /** First largest non-unit cell. * Fast, should usually produce smaller search spaces than shs_f. 
*/ shs_fl, /** First maximally non-trivially connected non-unit cell. * Not so fast, should usually produce smaller search spaces than shs_f, * shs_fs, and shs_fl. */ shs_fm, /** First smallest maximally non-trivially connected non-unit cell. * Not so fast, should usually produce smaller search spaces than shs_f, * shs_fs, and shs_fl. */ shs_fsm, /** First largest maximally non-trivially connected non-unit cell. * Not so fast, should usually produce smaller search spaces than shs_f, * shs_fs, and shs_fl. */ shs_flm } SplittingHeuristic; //moved from protected scope by Zhiqiang class Vertex { public: Vertex() { color = 0;} ~Vertex(){ ; } #ifdef USE_DOMAIN void add_edge(const unsigned other_vertex, Index index) { edges.push_back(std::make_pair(other_vertex, index)); #else void add_edge(const unsigned other_vertex) { edges.push_back(other_vertex); #endif } void remove_duplicate_edges(std::vector& tmp) { #if defined(BLISS_CONSISTENCY_CHECKS) /* Pre-conditions */ for(unsigned int i = 0; i < tmp.size(); i++) assert(tmp[i] == false); #endif for(std::vector::iterator iter = edges.begin(); iter != edges.end(); ) { #ifdef USE_DOMAIN const unsigned int dest_vertex = iter->first; //cxh #else const unsigned int dest_vertex = *iter; #endif if(tmp[dest_vertex] == true) { /* A duplicate edge found! 
*/ iter = edges.erase(iter); } else { /* Not seen earlier, mark as seen */ tmp[dest_vertex] = true; iter++; } } /* Clear tmp */ for(std::vector::iterator iter = edges.begin(); iter != edges.end(); iter++) { #ifdef USE_DOMAIN tmp[iter->first] = false;// cxh #else tmp[*iter] = false; #endif } #if defined(BLISS_CONSISTENCY_CHECKS) /* Post-conditions */ for(unsigned int i = 0; i < tmp.size(); i++) assert(tmp[i] == false); #endif } void sort_edges() { std::sort(edges.begin(), edges.end()); } unsigned color; //std::vector edges; std::vector edges; // cxh: add the edge ids from the embedding unsigned nof_edges() const {return edges.size(); } }; //added by Zhiqiang std::vector & get_vertices_rstream() { return vertices; } void sort_edges_rstream() { sort_edges(); } protected: std::vector vertices; void sort_edges() { for(unsigned int i = 0; i < get_nof_vertices(); i++) vertices[i].sort_edges(); } void remove_duplicate_edges() { std::vector tmp(vertices.size(), false); for(std::vector::iterator vi = vertices.begin(); vi != vertices.end(); vi++) { #if defined(BLISS_EXPENSIVE_CONSISTENCY_CHECKS) for(unsigned int i = 0; i < tmp.size(); i++) assert(tmp[i] == false); #endif (*vi).remove_duplicate_edges(tmp); } } // \internal Partition independent invariant. // Return the color of the vertex. Time complexity: O(1) static unsigned int vertex_color_invariant(const Graph* const g, const unsigned int v) { return g->vertices[v].color; } /** \internal * Partition independent invariant. * Returns the degree of the vertex. * DUPLICATE EDGES MUST HAVE BEEN REMOVED BEFORE. * Time complexity: O(1). */ // Return the degree of the vertex. Time complexity: O(1) static unsigned int degree_invariant(const Graph* const g, const unsigned int v) { return g->vertices[v].nof_edges(); } /** \internal * Partition independent invariant. * Returns 1 if there is an edge from the vertex to itself, 0 if not. * Time complexity: O(k), where k is the number of edges leaving the vertex. 
*/ // Return 1 if the vertex v has a self-loop, 0 otherwise // Time complexity: O(E_v), where E_v is the number of edges leaving v static unsigned selfloop_invariant(const Graph* const g, const unsigned int v) { const Vertex& vertex = g->vertices[v]; for(std::vector::const_iterator ei = vertex.edges.begin(); ei != vertex.edges.end(); ei++) { #ifdef USE_DOMAIN if(ei->first == v) return 1; // cxh #else if(*ei == v) return 1; #endif } return 0; } // Refine the partition p according to a partition independent invariant bool refine_according_to_invariant(unsigned int (*inv)(const Graph* const g, const unsigned int v)) { bool refined = false; for(Partition::Cell* cell = p.first_nonsingleton_cell; cell; ) { Partition::Cell* const next_cell = cell->next_nonsingleton; const unsigned int* ep = p.elements + cell->first; for(unsigned int i = cell->length; i > 0; i--, ep++) { const unsigned int ival = inv(this, *ep); p.invariant_values[*ep] = ival; if(ival > cell->max_ival) { cell->max_ival = ival; cell->max_ival_count = 1; } else if(ival == cell->max_ival) { cell->max_ival_count++; } } Partition::Cell* const last_new_cell = p.zplit_cell(cell, true); refined |= (last_new_cell != cell); cell = next_cell; } return refined; } // Routines needed when refining the partition p into equitable // Split the neighbourhood of a cell according to the equitable invariant bool split_neighbourhood_of_cell(Partition::Cell* const cell) { const bool was_equal_to_first = refine_equal_to_first; if(compute_eqref_hash) { eqref_hash.update(cell->first); eqref_hash.update(cell->length); } const unsigned int* ep = p.elements + cell->first; for(unsigned int i = cell->length; i > 0; i--) { const Vertex& v = vertices[*ep++]; std::vector::const_iterator ei = v.edges.begin(); for(unsigned int j = v.nof_edges(); j != 0; j--) { #ifdef USE_DOMAIN const unsigned int dest_vertex = (ei++)->first; // cxh #else const unsigned int dest_vertex = *ei++; #endif Partition::Cell * const neighbour_cell = 
p.get_cell(dest_vertex); if(neighbour_cell->is_unit()) continue; const unsigned int ival = ++p.invariant_values[dest_vertex]; if(ival > neighbour_cell->max_ival) { neighbour_cell->max_ival = ival; neighbour_cell->max_ival_count = 1; if(ival == 1) { neighbour_heap.insert(neighbour_cell->first); } } else if(ival == neighbour_cell->max_ival) { neighbour_cell->max_ival_count++; } } } while(!neighbour_heap.is_empty()) { const unsigned int start = neighbour_heap.remove(); Partition::Cell * const neighbour_cell = p.get_cell(p.elements[start]); if(compute_eqref_hash) { eqref_hash.update(neighbour_cell->first); eqref_hash.update(neighbour_cell->length); eqref_hash.update(neighbour_cell->max_ival); eqref_hash.update(neighbour_cell->max_ival_count); } Partition::Cell* const last_new_cell = p.zplit_cell(neighbour_cell, true); // Update certificate and hash if needed const Partition::Cell* c = neighbour_cell; while(1) { if(in_search) { // Build certificate cert_add_redundant(CERT_SPLIT, c->first, c->length); // No need to continue? 
if(refine_compare_certificate and (refine_equal_to_first == false) and (refine_cmp_to_best < 0)) goto worse_exit; } if(compute_eqref_hash) { eqref_hash.update(c->first); eqref_hash.update(c->length); } if(c == last_new_cell) break; c = c->next; } } if(refine_compare_certificate and (refine_equal_to_first == false) and (refine_cmp_to_best < 0)) return true; return false; worse_exit: // Clear neighbour heap UintSeqHash rest; while(!neighbour_heap.is_empty()) { const unsigned int start = neighbour_heap.remove(); Partition::Cell * const neighbour_cell = p.get_cell(p.elements[start]); if(opt_use_failure_recording and was_equal_to_first) { rest.update(neighbour_cell->first); rest.update(neighbour_cell->length); rest.update(neighbour_cell->max_ival); rest.update(neighbour_cell->max_ival_count); } neighbour_cell->max_ival = 0; neighbour_cell->max_ival_count = 0; p.clear_ivs(neighbour_cell); } if(opt_use_failure_recording and was_equal_to_first) { for(unsigned int i = p.splitting_queue.size(); i > 0; i--) { Partition::Cell* const cell = p.splitting_queue.pop_front(); rest.update(cell->first); rest.update(cell->length); p.splitting_queue.push_back(cell); } rest.update(failure_recording_fp_deviation); failure_recording_fp_deviation = rest.get_value(); } return true; } bool split_neighbourhood_of_unit_cell(Partition::Cell* const unit_cell) { const bool was_equal_to_first = refine_equal_to_first; if(compute_eqref_hash) { eqref_hash.update(0x87654321); eqref_hash.update(unit_cell->first); eqref_hash.update(1); } const Vertex& v = vertices[p.elements[unit_cell->first]]; std::vector::const_iterator ei = v.edges.begin(); for(unsigned int j = v.nof_edges(); j > 0; j--) { #ifdef USE_DOMAIN const unsigned int dest_vertex = (ei++)->first; // cxh #else const unsigned int dest_vertex = *ei++; #endif Partition::Cell * const neighbour_cell = p.get_cell(dest_vertex); if(neighbour_cell->is_unit()) { if(in_search) { /* Remember neighbour in order to generate certificate */ 
neighbour_heap.insert(neighbour_cell->first); } continue; } if(neighbour_cell->max_ival_count == 0) { neighbour_heap.insert(neighbour_cell->first); } neighbour_cell->max_ival_count++; unsigned int * const swap_position = p.elements + neighbour_cell->first + neighbour_cell->length - neighbour_cell->max_ival_count; *p.in_pos[dest_vertex] = *swap_position; p.in_pos[*swap_position] = p.in_pos[dest_vertex]; *swap_position = dest_vertex; p.in_pos[dest_vertex] = swap_position; } while(!neighbour_heap.is_empty()) { const unsigned int start = neighbour_heap.remove(); Partition::Cell* neighbour_cell = p.get_cell(p.elements[start]); #if defined(BLISS_CONSISTENCY_CHECKS) if(neighbour_cell->is_unit()) { } else { } #endif if(compute_eqref_hash) { eqref_hash.update(neighbour_cell->first); eqref_hash.update(neighbour_cell->length); eqref_hash.update(neighbour_cell->max_ival_count); } if(neighbour_cell->length > 1 and neighbour_cell->max_ival_count != neighbour_cell->length) { Partition::Cell * const new_cell = p.aux_split_in_two(neighbour_cell, neighbour_cell->length - neighbour_cell->max_ival_count); unsigned int *ep = p.elements + new_cell->first; unsigned int * const lp = p.elements+new_cell->first+new_cell->length; while(ep < lp) { p.element_to_cell_map[*ep] = new_cell; ep++; } neighbour_cell->max_ival_count = 0; if(compute_eqref_hash) { /* Update hash */ eqref_hash.update(neighbour_cell->first); eqref_hash.update(neighbour_cell->length); eqref_hash.update(0); eqref_hash.update(new_cell->first); eqref_hash.update(new_cell->length); eqref_hash.update(1); } /* Add cells in splitting_queue */ if(neighbour_cell->is_in_splitting_queue()) { /* Both cells must be included in splitting_queue in order to ensure refinement into equitable partition */ p.splitting_queue_add(new_cell); } else { Partition::Cell *min_cell, *max_cell; if(neighbour_cell->length <= new_cell->length) { min_cell = neighbour_cell; max_cell = new_cell; } else { min_cell = new_cell; max_cell = neighbour_cell; } /* 
Put the smaller cell in splitting_queue */ p.splitting_queue_add(min_cell); if(max_cell->is_unit()) { /* Put the "larger" cell also in splitting_queue */ p.splitting_queue_add(max_cell); } } /* Update pointer for certificate generation */ neighbour_cell = new_cell; } else { /* neighbour_cell->length == 1 || neighbour_cell->max_ival_count == neighbour_cell->length */ neighbour_cell->max_ival_count = 0; } /* * Build certificate if required */ if(in_search) { for(unsigned int i = neighbour_cell->first, j = neighbour_cell->length; j > 0; j--, i++) { /* Build certificate */ cert_add(CERT_EDGE, unit_cell->first, i); /* No need to continue? */ if(refine_compare_certificate and (refine_equal_to_first == false) and (refine_cmp_to_best < 0)) goto worse_exit; } } /* if(in_search) */ } /* while(!neighbour_heap.is_empty()) */ if(refine_compare_certificate and (refine_equal_to_first == false) and (refine_cmp_to_best < 0)) return true; return false; worse_exit: /* Clear neighbour heap */ UintSeqHash rest; while(!neighbour_heap.is_empty()) { const unsigned int start = neighbour_heap.remove(); Partition::Cell * const neighbour_cell = p.get_cell(p.elements[start]); if(opt_use_failure_recording and was_equal_to_first) { rest.update(neighbour_cell->first); rest.update(neighbour_cell->length); rest.update(neighbour_cell->max_ival_count); } neighbour_cell->max_ival_count = 0; } if(opt_use_failure_recording and was_equal_to_first) { rest.update(failure_recording_fp_deviation); failure_recording_fp_deviation = rest.get_value(); } return true; } // Build the initial equitable partition void make_initial_equitable_partition() { refine_according_to_invariant(&vertex_color_invariant); p.splitting_queue_clear(); //p.print_signature(stderr); fprintf(stderr, "\n"); refine_according_to_invariant(&selfloop_invariant); p.splitting_queue_clear(); //p.print_signature(stderr); fprintf(stderr, "\n"); refine_according_to_invariant(°ree_invariant); p.splitting_queue_clear(); //p.print_signature(stderr); 
fprintf(stderr, "\n"); refine_to_equitable(); //p.print_signature(stderr); fprintf(stderr, "\n"); } // \internal // \copydoc AbstractGraph::is_equitable() const //Check whether the current partition p is equitable. //Performance: very slow, use only for debugging purposes. bool is_equitable() const { const unsigned int N = get_nof_vertices(); if(N == 0) return true; std::vector first_count = std::vector(N, 0); std::vector other_count = std::vector(N, 0); for(Partition::Cell *cell = p.first_cell; cell; cell = cell->next) { if(cell->is_unit()) continue; unsigned int *ep = p.elements + cell->first; const Vertex &first_vertex = vertices[*ep++]; /* Count how many edges lead from the first vertex to * the neighbouring cells */ for(std::vector::const_iterator ei = first_vertex.edges.begin(); ei != first_vertex.edges.end(); ei++) { #ifdef USE_DOMAIN first_count[p.get_cell(ei->first)->first]++; // cxh #else first_count[p.get_cell(*ei)->first]++; #endif } /* Count and compare to the edges of the other vertices */ for(unsigned int i = cell->length; i > 1; i--) { const Vertex &vertex = vertices[*ep++]; for(std::vector::const_iterator ei = vertex.edges.begin(); ei != vertex.edges.end(); ei++) { #ifdef USE_DOMAIN other_count[p.get_cell(ei->first)->first]++; // cxh #else other_count[p.get_cell(*ei)->first]++; #endif } for(Partition::Cell *cell2 = p.first_cell; cell2; cell2 = cell2->next) { if(first_count[cell2->first] != other_count[cell2->first]) { /* Not equitable */ return false; } other_count[cell2->first] = 0; } } /* Reset first_count */ for(unsigned int i = 0; i < N; i++) first_count[i] = 0; } return true; } /* Splitting heuristics, documented in more detail in graph.cc */ SplittingHeuristic sh; // Find the next cell to be splitted Partition::Cell* find_next_cell_to_be_splitted(Partition::Cell* cell) { switch(sh) { case shs_f: return sh_first(); case shs_fs: return sh_first_smallest(); case shs_fl: return sh_first_largest(); case shs_fm: return sh_first_max_neighbours(); 
case shs_fsm: return sh_first_smallest_max_neighbours(); case shs_flm: return sh_first_largest_max_neighbours(); default: fatal_error("Internal error - unknown splitting heuristics"); return 0; } } // \internal // A splitting heuristic. // Returns the first nonsingleton cell in the current partition. Partition::Cell* sh_first() { Partition::Cell* best_cell = 0; for(Partition::Cell* cell = p.first_nonsingleton_cell; cell; cell = cell->next_nonsingleton) { if(opt_use_comprec and p.cr_get_level(cell->first) != cr_level) continue; best_cell = cell; break; } return best_cell; } // \internal A splitting heuristic. // Returns the first smallest nonsingleton cell in the current partition. Partition::Cell* sh_first_smallest() { Partition::Cell* best_cell = 0; unsigned int best_size = UINT_MAX; for(Partition::Cell* cell = p.first_nonsingleton_cell; cell; cell = cell->next_nonsingleton) { if(opt_use_comprec and p.cr_get_level(cell->first) != cr_level) continue; if(cell->length < best_size) { best_size = cell->length; best_cell = cell; } } return best_cell; } // \internal A splitting heuristic. // Returns the first largest nonsingleton cell in the current partition. Partition::Cell* sh_first_largest() { Partition::Cell* best_cell = 0; unsigned int best_size = 0; for(Partition::Cell* cell = p.first_nonsingleton_cell; cell; cell = cell->next_nonsingleton) { if(opt_use_comprec and p.cr_get_level(cell->first) != cr_level) continue; if(cell->length > best_size) { best_size = cell->length; best_cell = cell; } } return best_cell; } // \internal // A splitting heuristic. // Returns the first nonsingleton cell with max number of neighbouring nonsingleton cells. // Assumes that the partition p is equitable. // Assumes that the max_ival fields of the cells are all 0. 
Partition::Cell* sh_first_max_neighbours() { Partition::Cell* best_cell = 0; int best_value = -1; KStack neighbour_cells_visited; neighbour_cells_visited.init(get_nof_vertices()); for(Partition::Cell* cell = p.first_nonsingleton_cell; cell; cell = cell->next_nonsingleton) { if(opt_use_comprec and p.cr_get_level(cell->first) != cr_level) continue; const Vertex& v = vertices[p.elements[cell->first]]; std::vector::const_iterator ei = v.edges.begin(); for(unsigned int j = v.nof_edges(); j > 0; j--) { #ifdef USE_DOMAIN Partition::Cell * const neighbour_cell = p.get_cell((ei++)->first); // cxh #else Partition::Cell * const neighbour_cell = p.get_cell(*ei++); #endif if(neighbour_cell->is_unit()) continue; neighbour_cell->max_ival++; if(neighbour_cell->max_ival == 1) neighbour_cells_visited.push(neighbour_cell); } int value = 0; while(!neighbour_cells_visited.is_empty()) { Partition::Cell* const neighbour_cell = neighbour_cells_visited.pop(); if(neighbour_cell->max_ival != neighbour_cell->length) value++; neighbour_cell->max_ival = 0; } if(value > best_value) { best_value = value; best_cell = cell; } } return best_cell; } // \internal A splitting heuristic. // Returns the first smallest nonsingleton cell with max number of neighbouring nonsingleton cells. // Assumes that the partition p is equitable. Assumes that the max_ival fields of the cells are all 0. 
Partition::Cell* sh_first_smallest_max_neighbours() { Partition::Cell* best_cell = 0; int best_value = -1; unsigned int best_size = UINT_MAX; KStack neighbour_cells_visited; neighbour_cells_visited.init(get_nof_vertices()); for(Partition::Cell* cell = p.first_nonsingleton_cell; cell; cell = cell->next_nonsingleton) { if(opt_use_comprec and p.cr_get_level(cell->first) != cr_level) continue; const Vertex& v = vertices[p.elements[cell->first]]; std::vector::const_iterator ei = v.edges.begin(); for(unsigned int j = v.nof_edges(); j > 0; j--) { #ifdef USE_DOMAIN Partition::Cell* const neighbour_cell = p.get_cell((ei++)->first); // cxh #else Partition::Cell* const neighbour_cell = p.get_cell(*ei++); #endif if(neighbour_cell->is_unit()) continue; neighbour_cell->max_ival++; if(neighbour_cell->max_ival == 1) neighbour_cells_visited.push(neighbour_cell); } int value = 0; while(!neighbour_cells_visited.is_empty()) { Partition::Cell* const neighbour_cell = neighbour_cells_visited.pop(); if(neighbour_cell->max_ival != neighbour_cell->length) value++; neighbour_cell->max_ival = 0; } if((value > best_value) or (value == best_value and cell->length < best_size)) { best_value = value; best_size = cell->length; best_cell = cell; } } return best_cell; } // \internal A splitting heuristic. // Returns the first largest nonsingleton cell with max number of neighbouring nonsingleton cells. // Assumes that the partition p is equitable. Assumes that the max_ival fields of the cells are all 0. 
Partition::Cell* sh_first_largest_max_neighbours() { Partition::Cell* best_cell = 0; int best_value = -1; unsigned int best_size = 0; KStack neighbour_cells_visited; neighbour_cells_visited.init(get_nof_vertices()); for(Partition::Cell* cell = p.first_nonsingleton_cell; cell; cell = cell->next_nonsingleton) { if(opt_use_comprec and p.cr_get_level(cell->first) != cr_level) continue; const Vertex& v = vertices[p.elements[cell->first]]; std::vector::const_iterator ei = v.edges.begin(); for(unsigned int j = v.nof_edges(); j > 0; j--) { #ifdef USE_DOMAIN Partition::Cell* const neighbour_cell = p.get_cell((ei++)->first); // cxh #else Partition::Cell* const neighbour_cell = p.get_cell(*ei++); #endif if(neighbour_cell->is_unit()) continue; neighbour_cell->max_ival++; if(neighbour_cell->max_ival == 1) neighbour_cells_visited.push(neighbour_cell); } int value = 0; while(!neighbour_cells_visited.is_empty()) { Partition::Cell* const neighbour_cell = neighbour_cells_visited.pop(); if(neighbour_cell->max_ival != neighbour_cell->length) value++; neighbour_cell->max_ival = 0; } if((value > best_value) or (value == best_value and cell->length > best_size)) { best_value = value; best_size = cell->length; best_cell = cell; } } return best_cell; } //Initialize the certificate size and memory void initialize_certificate() { certificate_index = 0; certificate_current_path.clear(); certificate_first_path.clear(); certificate_best_path.clear(); } bool is_automorphism(unsigned* const perm) { std::set > edges1; std::set > edges2; #if defined(BLISS_CONSISTENCY_CHECKS) if(!is_permutation(get_nof_vertices(), perm)) _INTERNAL_ERROR(); #endif for(unsigned int i = 0; i < get_nof_vertices(); i++) { Vertex& v1 = vertices[i]; edges1.clear(); for(std::vector::iterator ei = v1.edges.begin(); ei != v1.edges.end(); ei++) #ifdef USE_DOMAIN edges1.insert(perm[ei->first]); // cxh #else edges1.insert(perm[*ei]); #endif Vertex& v2 = vertices[perm[i]]; edges2.clear(); for(std::vector::iterator ei = 
v2.edges.begin(); ei != v2.edges.end(); ei++) #ifdef USE_DOMAIN edges2.insert(ei->first); // cxh #else edges2.insert(*ei); #endif if(!(edges1 == edges2)) return false; } return true; } bool nucr_find_first_component(const unsigned level) { cr_component.clear(); cr_component_elements = 0; /* Find first non-discrete cell in the component level */ Partition::Cell* first_cell = p.first_nonsingleton_cell; while(first_cell) { if(p.cr_get_level(first_cell->first) == level) break; first_cell = first_cell->next_nonsingleton; } /* The component is discrete, return false */ if(!first_cell) return false; std::vector component; first_cell->max_ival = 1; component.push_back(first_cell); for(unsigned int i = 0; i < component.size(); i++) { Partition::Cell* const cell = component[i]; const Vertex& v = vertices[p.elements[cell->first]]; std::vector::const_iterator ei = v.edges.begin(); for(unsigned int j = v.nof_edges(); j > 0; j--) { #ifdef USE_DOMAIN const unsigned int neighbour = (ei++)->first; // cxh #else const unsigned int neighbour = *ei++; #endif Partition::Cell* const neighbour_cell = p.get_cell(neighbour); /* Skip unit neighbours */ if(neighbour_cell->is_unit()) continue; /* Already marked to be in the same component? */ if(neighbour_cell->max_ival == 1) continue; /* Is the neighbour at the same component recursion level? 
*/ if(p.cr_get_level(neighbour_cell->first) != level) continue; if(neighbour_cell->max_ival_count == 0) neighbour_heap.insert(neighbour_cell->first); neighbour_cell->max_ival_count++; } while(!neighbour_heap.is_empty()) { const unsigned int start = neighbour_heap.remove(); Partition::Cell* const neighbour_cell = p.get_cell(p.elements[start]); /* Skip saturated neighbour cells */ if(neighbour_cell->max_ival_count == neighbour_cell->length) { neighbour_cell->max_ival_count = 0; continue; } neighbour_cell->max_ival_count = 0; neighbour_cell->max_ival = 1; component.push_back(neighbour_cell); } } for(unsigned int i = 0; i < component.size(); i++) { Partition::Cell* const cell = component[i]; cell->max_ival = 0; cr_component.push_back(cell->first); cr_component_elements += cell->length; } if(verbstr and verbose_level > 2) { fprintf(verbstr, "NU-component with %lu cells and %u vertices\n", (long unsigned)cr_component.size(), cr_component_elements); fflush(verbstr); } return true; } bool nucr_find_first_component(const unsigned int level, std::vector& component, unsigned int& component_elements, Partition::Cell*& sh_return) { component.clear(); component_elements = 0; sh_return = 0; unsigned int sh_first = 0; unsigned int sh_size = 0; unsigned int sh_nuconn = 0; /* Find first non-discrete cell in the component level */ Partition::Cell* first_cell = p.first_nonsingleton_cell; while(first_cell) { if(p.cr_get_level(first_cell->first) == level) break; first_cell = first_cell->next_nonsingleton; } if(!first_cell) { /* The component is discrete, return false */ return false; } std::vector comp; KStack neighbours; neighbours.init(get_nof_vertices()); first_cell->max_ival = 1; comp.push_back(first_cell); for(unsigned int i = 0; i < comp.size(); i++) { Partition::Cell* const cell = comp[i]; const Vertex& v = vertices[p.elements[cell->first]]; std::vector::const_iterator ei = v.edges.begin(); for(unsigned int j = v.nof_edges(); j > 0; j--) { #ifdef USE_DOMAIN const unsigned int 
neighbour = (ei++)->first; // cxh #else const unsigned int neighbour = *ei++; #endif Partition::Cell* const neighbour_cell = p.get_cell(neighbour); /* Skip unit neighbours */ if(neighbour_cell->is_unit()) continue; /* Is the neighbour at the same component recursion level? */ //if(p.cr_get_level(neighbour_cell->first) != level) // continue; if(neighbour_cell->max_ival_count == 0) neighbours.push(neighbour_cell); neighbour_cell->max_ival_count++; } unsigned int nuconn = 1; while(!neighbours.is_empty()) { Partition::Cell* const neighbour_cell = neighbours.pop(); //neighbours.pop_back(); /* Skip saturated neighbour cells */ if(neighbour_cell->max_ival_count == neighbour_cell->length) { neighbour_cell->max_ival_count = 0; continue; } nuconn++; neighbour_cell->max_ival_count = 0; if(neighbour_cell->max_ival == 0) { comp.push_back(neighbour_cell); neighbour_cell->max_ival = 1; } } switch(sh) { case shs_f: if(sh_return == 0 or cell->first <= sh_first) { sh_return = cell; sh_first = cell->first; } break; case shs_fs: if(sh_return == 0 or cell->length < sh_size or (cell->length == sh_size and cell->first <= sh_first)) { sh_return = cell; sh_first = cell->first; sh_size = cell->length; } break; case shs_fl: if(sh_return == 0 or cell->length > sh_size or (cell->length == sh_size and cell->first <= sh_first)) { sh_return = cell; sh_first = cell->first; sh_size = cell->length; } break; case shs_fm: if(sh_return == 0 or nuconn > sh_nuconn or (nuconn == sh_nuconn and cell->first <= sh_first)) { sh_return = cell; sh_first = cell->first; sh_nuconn = nuconn; } break; case shs_fsm: if(sh_return == 0 or nuconn > sh_nuconn or (nuconn == sh_nuconn and (cell->length < sh_size or (cell->length == sh_size and cell->first <= sh_first)))) { sh_return = cell; sh_first = cell->first; sh_size = cell->length; sh_nuconn = nuconn; } break; case shs_flm: if(sh_return == 0 or nuconn > sh_nuconn or (nuconn == sh_nuconn and (cell->length > sh_size or (cell->length == sh_size and cell->first <= 
sh_first)))) { sh_return = cell; sh_first = cell->first; sh_size = cell->length; sh_nuconn = nuconn; } break; default: fatal_error("Internal error - unknown splitting heuristics"); return 0; } } assert(sh_return); for(unsigned int i = 0; i < comp.size(); i++) { Partition::Cell* const cell = comp[i]; cell->max_ival = 0; component.push_back(cell->first); component_elements += cell->length; } if(verbstr and verbose_level > 2) { fprintf(verbstr, "NU-component with %lu cells and %u vertices\n", (long unsigned)component.size(), component_elements); fflush(verbstr); } return true; } public: // Create a new graph with \a N vertices and no edges. Graph(const unsigned nof_vertices = 0) { vertices.resize(nof_vertices); sh = shs_flm; } /** * Destroy the graph. */ ~Graph() { ; } /** * Read the graph from the file \a fp in a variant of the DIMACS format. * See the bliss website * for the definition of the file format. * Note that in the DIMACS file the vertices are numbered from 1 to N while * in this C++ API they are from 0 to N-1. * Thus the vertex n in the file corresponds to the vertex n-1 in the API. * * \param fp the file stream for the graph file * \param errstr if non-null, the possible error messages are printed * in this file stream * \return a new Graph object or 0 if reading failed for some * reason */ static Graph* read_dimacs(FILE* const fp, FILE* const errstr = stderr) { return NULL; } /** * Write the graph to a file in a variant of the DIMACS format. * See the bliss website * for the definition of the file format. 
*/
// Stub: DIMACS output is not supported in this vendored build.
void write_dimacs(FILE* const fp) {}

// \copydoc AbstractGraph::write_dot(FILE * const fp)
// Stub: dot output is not supported in this vendored build.
void write_dot(FILE* const fp) {}

// \copydoc AbstractGraph::write_dot(const char * const file_name)
void write_dot(const char* const file_name) {}

// \copydoc AbstractGraph::is_automorphism(const std::vector& perm) const
// Returns true iff \a perm is a permutation of the vertices that maps the
// edge relation onto itself: for every vertex i, the image of i's
// neighbourhood under \a perm must equal the neighbourhood of perm[i].
// NOTE(review): the angle-bracket template arguments appear to have been
// stripped from this file by text extraction (e.g. "std::set >",
// "std::vector::const_iterator"); restore them against the upstream
// bliss sources before compiling.
bool is_automorphism(const std::vector& perm) const
{
  if(!(perm.size() == get_nof_vertices() and is_permutation(perm)))
    return false;

  std::set > edges1;
  std::set > edges2;

  for(unsigned i = 0; i < get_nof_vertices(); i++)
    {
      // Collect the permuted neighbourhood of vertex i ...
      const Vertex& v1 = vertices[i];
      edges1.clear();
      for(std::vector::const_iterator ei = v1.edges.begin();
          ei != v1.edges.end();
          ei++)
#ifdef USE_DOMAIN
        edges1.insert(perm[ei->first]); // cxh
#else
        edges1.insert(perm[*ei]);
#endif
      // ... and the neighbourhood of the image vertex perm[i].
      const Vertex& v2 = vertices[perm[i]];
      edges2.clear();
      for(std::vector::const_iterator ei = v2.edges.begin();
          ei != v2.edges.end();
          ei++)
#ifdef USE_DOMAIN
        edges2.insert(ei->first); // cxh
#else
        edges2.insert(*ei);
#endif
      if(!(edges1 == edges2))
        return false;
    }
  return true;
}

// \copydoc AbstractGraph::get_hash()
// Hash is computed over the vertex count, the vertex colors, and each
// undirected edge counted once; duplicate edges are removed and edges are
// sorted first so that equal graphs hash equally.
virtual unsigned get_hash()
{
  remove_duplicate_edges();
  sort_edges();

  UintSeqHash h;

  h.update(get_nof_vertices());

  /* Hash the color of each vertex */
  for(unsigned int i = 0; i < get_nof_vertices(); i++)
    {
      h.update(vertices[i].color);
    }

  /* Hash the edges */
  for(unsigned int i = 0; i < get_nof_vertices(); i++)
    {
      Vertex &v = vertices[i];
      for(std::vector::const_iterator ei = v.edges.begin();
          ei != v.edges.end();
          ei++)
        {
#ifdef USE_DOMAIN
          const unsigned int dest_i = ei->first; // cxh
#else
          const unsigned int dest_i = *ei;
#endif
          // Each undirected edge appears in both endpoint lists; hash it
          // only from the smaller-numbered endpoint.
          if(dest_i < i)
            continue;
          h.update(i);
          h.update(dest_i);
        }
    }
  return h.get_value();
}

// Return the number of vertices in the graph.
unsigned int get_nof_vertices() const {return vertices.size(); }

// \copydoc AbstractGraph::permute(const unsigned int* const perm) const
// Returns a newly allocated graph in which vertex i of this graph has become
// vertex perm[i]; caller owns the returned object.
const Graph* permute(const unsigned* perm) const
{
#if defined(BLISS_CONSISTENCY_CHECKS)
  if(!is_permutation(get_nof_vertices(), perm))
    _INTERNAL_ERROR();
#endif
  Graph* const g = new Graph(get_nof_vertices());
  for(unsigned i = 0; i < get_nof_vertices(); i++)
    {
      const Vertex& v = vertices[i];
      Vertex& permuted_v = g->vertices[perm[i]];
      permuted_v.color = v.color;
      for(std::vector::const_iterator ei = v.edges.begin();
          ei != v.edges.end();
          ei++)
        {
#ifdef USE_DOMAIN
          const unsigned dest_v = ei->first; //cxh
          permuted_v.add_edge(perm[dest_v], ei->second);
#else
          const unsigned dest_v = *ei;
          permuted_v.add_edge(perm[dest_v]);
#endif
        }
      permuted_v.sort_edges();
    }
  return g;
}

// Overload of permute() taking the permutation as a vector; same semantics
// as the pointer overload above. Caller owns the returned object.
Graph* permute(const std::vector& perm) const
{
#if defined(BLISS_CONSISTENCY_CHECKS)
#endif
  Graph* const g = new Graph(get_nof_vertices());
  for(unsigned int i = 0; i < get_nof_vertices(); i++)
    {
      const Vertex& v = vertices[i];
      Vertex& permuted_v = g->vertices[perm[i]];
      permuted_v.color = v.color;
      for(std::vector::const_iterator ei = v.edges.begin();
          ei != v.edges.end();
          ei++)
        {
#ifdef USE_DOMAIN
          const unsigned dest_v = ei->first; // cxh
          permuted_v.add_edge(perm[dest_v], ei->second);
#else
          const unsigned dest_v = *ei;
          permuted_v.add_edge(perm[dest_v]);
#endif
        }
      permuted_v.sort_edges();
    }
  return g;
}

// Add a new vertex with color \a color in the graph and return its index.
unsigned add_vertex(const unsigned color = 0)
{
  const unsigned int vertex_num = vertices.size();
  vertices.resize(vertex_num + 1);
  vertices.back().color = color;
  return vertex_num;
}

/**
 * Add an edge between vertices \a v1 and \a v2.
 * Duplicate edges between vertices are ignored but try to avoid introducing
 * them in the first place as they are not ignored immediately but will
 * consume memory and computation resources for a while.
* The selected splitting heuristics affects the computed canonical * labelings; therefore, if you want to compare whether two graphs * are isomorphic by computing and comparing (for equality) their * canonical versions, be sure to use the same splitting heuristics * for both graphs. */ void set_splitting_heuristic(const SplittingHeuristic shs) {sh = shs; } }; } #endif ================================================ FILE: external/bliss/bliss/heap.hh ================================================ #ifndef BLISS_HEAP_HH #define BLISS_HEAP_HH #include #include #include namespace bliss { /** \internal * \brief A capacity bounded heap data structure. */ class Heap { unsigned int N; unsigned int n; unsigned int *array; //void upheap(unsigned int k); void upheap(unsigned int index) { const unsigned int v = array[index]; array[0] = 0; while(array[index/2] > v) { array[index] = array[index/2]; index = index/2; } array[index] = v; } //void downheap(unsigned int k); void downheap(unsigned int index) { const unsigned int v = array[index]; const unsigned int lim = n/2; while(index <= lim) { unsigned int new_index = index + index; if((new_index < n) and (array[new_index] > array[new_index+1])) new_index++; if(v <= array[new_index]) break; array[index] = array[new_index]; index = new_index; } array[index] = v; } public: /** * Create a new heap. * init() must be called after this. */ Heap() {array = 0; n = 0; N = 0; } ~Heap() { if(array) { free(array); array = 0; n = 0; N = 0; } } /** * Initialize the heap to have the capacity to hold \e size elements. */ //void init(const unsigned int size); void init(const unsigned int size) { if(size > N) { if(array) free(array); array = (unsigned int*)malloc((size + 1) * sizeof(unsigned int)); N = size; } } /** * Is the heap empty? * Time complexity is O(1). */ bool is_empty() const { return (n==0); } /** * Remove all the elements in the heap. * Time complexity is O(1). */ void clear() { n = 0; } /** * Insert the element \a e in the heap. 
* Time complexity is O(log(N)), where N is the number of elements * currently in the heap. */ //void insert(const unsigned int e); void insert(const unsigned int v) { array[++n] = v; upheap(n); } /** * Remove and return the smallest element in the heap. * Time complexity is O(log(N)), where N is the number of elements * currently in the heap. */ //unsigned int remove(); unsigned int remove() { const unsigned int v = array[1]; array[1] = array[n--]; downheap(1); return v; } /** * Get the number of elements in the heap. */ unsigned int size() const {return n; } }; } // namespace bliss #endif ================================================ FILE: external/bliss/bliss/kqueue.hh ================================================ #ifndef BLISS_KQUEUE_HH #define BLISS_KQUEUE_HH /* Copyright (c) 2003-2015 Tommi Junttila Released under the GNU Lesser General Public License version 3. This file is part of bliss. bliss is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, version 3 of the License. bliss is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with bliss. If not, see . */ #include "defs.hh" namespace bliss { /** \internal * \brief A very simple implementation of queues with fixed capacity. */ template class KQueue { public: /** * Create a new queue with capacity zero. * The function init() should be called next. */ KQueue(); ~KQueue(); /** * Initialize the queue to have the capacity to hold at most \a N elements. */ void init(const unsigned int N); /** Is the queue empty? */ bool is_empty() const; /** Return the number of elements in the queue. 
*/ unsigned int size() const; /** Remove all the elements in the queue. */ void clear(); /** Return (but don't remove) the first element in the queue. */ Type front() const; /** Remove and return the first element of the queue. */ Type pop_front(); /** Push the element \a e in the front of the queue. */ void push_front(Type e); /** Remove and return the last element of the queue. */ Type pop_back(); /** Push the element \a e in the back of the queue. */ void push_back(Type e); private: Type *entries, *end; Type *head, *tail; }; template KQueue::KQueue() { entries = 0; end = 0; head = 0; tail = 0; } template KQueue::~KQueue() { if(entries) free(entries); } template void KQueue::init(const unsigned int k) { assert(k > 0); if(entries) free(entries); entries = (Type*)malloc((k + 1) * sizeof(Type)); end = entries + k + 1; head = entries; tail = head; } template void KQueue::clear() { head = entries; tail = head; } template bool KQueue::is_empty() const { return(head == tail); } template unsigned int KQueue::size() const { if(tail >= head) return(tail - head); return((end - head) + (tail - entries)); } template Type KQueue::front() const { return *head; } template Type KQueue::pop_front() { Type *old_head = head; head++; if(head == end) head = entries; return *old_head; } template void KQueue::push_front(Type e) { if(head == entries) head = end - 1; else head--; *head = e; } template void KQueue::push_back(Type e) { *tail = e; tail++; if(tail == end) tail = entries; } } // namespace bliss #endif ================================================ FILE: external/bliss/bliss/kstack.hh ================================================ #ifndef BLISS_KSTACK_H #define BLISS_KSTACK_H /* Copyright (c) 2003-2015 Tommi Junttila Released under the GNU Lesser General Public License version 3. This file is part of bliss. 
bliss is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, version 3 of the License. bliss is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with bliss. If not, see . */ #include #include "defs.hh" namespace bliss { /** \internal * \brief A very simple implementation of a stack with fixed capacity. */ template class KStack { public: /** * Create a new stack with zero capacity. * The function init() should be called next. */ KStack(); /** * Create a new stack with the capacity to hold at most \a N elements. */ KStack(int N); ~KStack(); /** * Initialize the stack to have the capacity to hold at most \a N elements. */ void init(int N); /** * Is the stack empty? */ bool is_empty() const {return(cursor == entries); } /** * Return (but don't remove) the top element of the stack. */ Type top() const {BLISS_ASSERT(cursor > entries); return *cursor; } /** * Pop (remove) the top element of the stack. */ Type pop() { return *cursor--; } /** * Push the element \a e in the stack. */ void push(Type e) { *(++cursor) = e; } /** Remove all the elements in the stack. */ void clean() {cursor = entries; } /** * Get the number of elements in the stack. */ unsigned int size() const {return(cursor - entries); } /** * Return the i:th element in the stack, where \a i is in the range * 0,...,this.size()-1; the 0:th element is the bottom element * in the stack. */ Type element_at(unsigned int i) { assert(i < size()); return entries[i+1]; } /** Return the capacity (NOT the number of elements) of the stack. 
*/ int capacity() {return kapacity; } private: int kapacity; Type *entries; Type *cursor; }; template KStack::KStack() { kapacity = 0; entries = 0; cursor = 0; } template KStack::KStack(int k) { assert(k > 0); kapacity = k; entries = (Type*)malloc((k+1) * sizeof(Type)); cursor = entries; } template void KStack::init(int k) { assert(k > 0); if(entries) free(entries); kapacity = k; entries = (Type*)malloc((k+1) * sizeof(Type)); cursor = entries; } template KStack::~KStack() { free(entries); } } // namespace bliss #endif ================================================ FILE: external/bliss/bliss/orbit.hh ================================================ #ifndef BLISS_ORBIT_HH #define BLISS_ORBIT_HH namespace bliss { class Orbit { class OrbitEntry { public: unsigned int element; OrbitEntry *next; unsigned int size; }; OrbitEntry *orbits; OrbitEntry **in_orbit; unsigned int nof_elements; unsigned int _nof_orbits; void merge_orbits(OrbitEntry *orbit1, OrbitEntry *orbit2) { if(orbit1 != orbit2) { _nof_orbits--; // Only update the elements in the smaller orbit if(orbit1->size > orbit2->size) { OrbitEntry * const temp = orbit2; orbit2 = orbit1; orbit1 = temp; } // Link the elements of orbit1 to the almost beginning of orbit2 OrbitEntry *e = orbit1; while(e->next) { in_orbit[e->element] = orbit2; e = e->next; } in_orbit[e->element] = orbit2; e->next = orbit2->next; orbit2->next = orbit1; // Keep the minimal orbit representative in the beginning if(orbit1->element < orbit2->element) { const unsigned int temp = orbit1->element; orbit1->element = orbit2->element; orbit2->element = temp; } orbit2->size += orbit1->size; } } public: // Create a new orbit information object. // The init() function must be called next to actually initialize the object. 
Orbit() { orbits = 0; in_orbit = 0; nof_elements = 0; } ~Orbit() { if(orbits) { free(orbits); orbits = 0; } if(in_orbit) { free(in_orbit); in_orbit = 0; } nof_elements = 0; } // Initialize the orbit information to consider sets of \a N elements. // It is required that \a N > 0. // The orbit information is reset so that each element forms an orbit of its own. // Time complexity is O(N). \sa reset() void init(const unsigned int n) { assert(n > 0); if(orbits) free(orbits); orbits = (OrbitEntry*)malloc(n * sizeof(OrbitEntry)); if(in_orbit) free(in_orbit); in_orbit = (OrbitEntry**)malloc(n * sizeof(OrbitEntry*)); nof_elements = n; reset(); } // Reset the orbits so that each element forms an orbit of its own. // Time complexity is O(N). void reset() { assert(orbits); assert(in_orbit); for(unsigned int i = 0; i < nof_elements; i++) { orbits[i].element = i; orbits[i].next = 0; orbits[i].size = 1; in_orbit[i] = &orbits[i]; } _nof_orbits = nof_elements; } // Merge the orbits of the elements \a e1 and \a e2. // Time complexity is O(k), where k is the number of elements in // the smaller of the merged orbits. void merge_orbits(unsigned int e1, unsigned int e2) { merge_orbits(in_orbit[e1], in_orbit[e2]); } // Is the element \a e the smallest element in its orbit? // Time complexity is O(1). bool is_minimal_representative(unsigned element) const { return(get_minimal_representative(element) == element); } /// Get the smallest element in the orbit of the element \a e. // Time complexity is O(1). unsigned get_minimal_representative(unsigned element) const { OrbitEntry * const orbit = in_orbit[element]; return(orbit->element); } // Get the number of elements in the orbit of the element \a e. // Time complexity is O(1). unsigned orbit_size(unsigned element) const { return(in_orbit[element]->size); } // Get the number of orbits. // Time complexity is O(1). 
unsigned int nof_orbits() const {return _nof_orbits; } }; } // namespace bliss #endif ================================================ FILE: external/bliss/bliss/partition.hh ================================================ #ifndef BLISS_PARTITION_HH #define BLISS_PARTITION_HH /* Copyright (c) 2003-2015 Tommi Junttila Released under the GNU Lesser General Public License version 3. This file is part of bliss. bliss is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, version 3 of the License. bliss is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with bliss. If not, see . */ namespace bliss { class Partition; } #include #include #include #include #include "kstack.hh" #include "kqueue.hh" #include "heap.hh" #include "orbit.hh" #include "abgraph.hh" #include "graph.hh" namespace bliss { /** \internal * \brief A class for refinable, backtrackable ordered partitions. * * This is rather a data structure with some helper functions than * a proper self-contained class. * That is, for efficiency reasons the fields of this class are directly * manipulated from bliss::AbstractGraph and its subclasses. * Conversely, some methods of this class modify the fields of * bliss::AbstractGraph, too. */ class Partition { public: /** * \brief Data structure for holding information about a cell in a Partition. 
*/ class Cell { friend class Partition; public: unsigned int length; // Index of the first element of the cell in the Partition::elements array unsigned int first; unsigned int max_ival; unsigned int max_ival_count; private: bool in_splitting_queue; public: bool in_neighbour_heap; /* Pointer to the next cell, null if this is the last one. */ Cell* next; Cell* prev; Cell* next_nonsingleton; Cell* prev_nonsingleton; unsigned int split_level; /** Is this a unit cell? */ bool is_unit() const {return(length == 1); } /** Is this cell in splitting queue? */ bool is_in_splitting_queue() const {return(in_splitting_queue); } }; private: /** \internal * Data structure for remembering information about splits in order to * perform efficient backtracking over the splits. */ class RefInfo { public: unsigned int split_cell_first; int prev_nonsingleton_first; int next_nonsingleton_first; }; /** \internal * A stack for remembering the splits, used for backtracking. */ KStack refinement_stack; class BacktrackInfo { public: BacktrackInfo() : refinement_stack_size(0), cr_backtrack_point(0) {} unsigned int refinement_stack_size; unsigned int cr_backtrack_point; }; /** \internal * The main stack for enabling backtracking. */ std::vector bt_stack; public: AbstractGraph* graph; /* Used during equitable partition refinement */ KQueue splitting_queue; //void splitting_queue_add(Cell* const cell); Cell* splitting_queue_pop(); bool splitting_queue_is_empty() const; //void splitting_queue_clear(); void splitting_queue_add(Cell* const cell) { static const unsigned int smallish_cell_threshold = 1; cell->in_splitting_queue = true; if(cell->length <= smallish_cell_threshold) splitting_queue.push_front(cell); else splitting_queue.push_back(cell); } void splitting_queue_clear() { while(!splitting_queue_is_empty()) splitting_queue_pop(); } /** Type for backtracking points. 
*/ typedef unsigned int BacktrackPoint; /** * Get a new backtrack point for the current partition */ //BacktrackPoint set_backtrack_point(); BacktrackPoint set_backtrack_point() { BacktrackInfo info; info.refinement_stack_size = refinement_stack.size(); if(cr_enabled) info.cr_backtrack_point = cr_get_backtrack_point(); BacktrackPoint p = bt_stack.size(); bt_stack.push_back(info); return p; } /** * Backtrack to the point \a p and remove it. */ //void goto_backtrack_point(BacktrackPoint p); void goto_backtrack_point(BacktrackPoint p) { BacktrackInfo info = bt_stack[p]; bt_stack.resize(p); if(cr_enabled) cr_goto_backtrack_point(info.cr_backtrack_point); const unsigned int dest_refinement_stack_size = info.refinement_stack_size; assert(refinement_stack.size() >= dest_refinement_stack_size); while(refinement_stack.size() > dest_refinement_stack_size) { RefInfo i = refinement_stack.pop(); const unsigned int first = i.split_cell_first; Cell* cell = get_cell(elements[first]); if(cell->first != first) { assert(cell->first < first); assert(cell->split_level <= dest_refinement_stack_size); goto done; } assert(cell->split_level > dest_refinement_stack_size); while(cell->split_level > dest_refinement_stack_size) { assert(cell->prev); cell = cell->prev; } while(cell->next and cell->next->split_level > dest_refinement_stack_size) { /* Merge next cell */ Cell* const next_cell = cell->next; if(cell->length == 1) discrete_cell_count--; if(next_cell->length == 1) discrete_cell_count--; /* Update element_to_cell_map values of elements added in cell */ unsigned int* ep = elements + next_cell->first; unsigned int* const lp = ep + next_cell->length; for( ; ep < lp; ep++) element_to_cell_map[*ep] = cell; /* Update cell parameters */ cell->length += next_cell->length; if(next_cell->next) next_cell->next->prev = cell; cell->next = next_cell->next; /* (Pseudo)free next_cell */ next_cell->first = 0; next_cell->length = 0; next_cell->prev = 0; next_cell->next = free_cells; free_cells = 
next_cell; } done: if(i.prev_nonsingleton_first >= 0) { Cell* const prev_cell = get_cell(elements[i.prev_nonsingleton_first]); cell->prev_nonsingleton = prev_cell; prev_cell->next_nonsingleton = cell; } else { //assert(cell->prev_nonsingleton == 0); cell->prev_nonsingleton = 0; first_nonsingleton_cell = cell; } if(i.next_nonsingleton_first >= 0) { Cell* const next_cell = get_cell(elements[i.next_nonsingleton_first]); cell->next_nonsingleton = next_cell; next_cell->prev_nonsingleton = cell; } else { //assert(cell->next_nonsingleton == 0); cell->next_nonsingleton = 0; } } } /** * Split the non-unit Cell \a cell = {\a element,e1,e2,...,en} containing * the element \a element in two: * \a cell = {e1,...,en} and \a newcell = {\a element}. * @param cell a non-unit Cell * @param element an element in \a cell * @return the new unit Cell \a newcell */ //Cell* individualize(Cell* const cell, const unsigned int element); Cell* individualize(Cell * const cell, const unsigned int element) { unsigned int * const pos = in_pos[element]; const unsigned int last = cell->first + cell->length - 1; *pos = elements[last]; in_pos[*pos] = pos; elements[last] = element; in_pos[element] = elements + last; Partition::Cell * const new_cell = aux_split_in_two(cell, cell->length-1); element_to_cell_map[element] = new_cell; return new_cell; } //Cell* aux_split_in_two(Cell* const cell, const unsigned int first_half_size); Cell* aux_split_in_two(Cell* const cell, const unsigned int first_half_size) { RefInfo i; /* (Pseudo)allocate new cell */ Cell * const new_cell = free_cells; free_cells = new_cell->next; /* Update new cell parameters */ new_cell->first = cell->first + first_half_size; new_cell->length = cell->length - first_half_size; new_cell->next = cell->next; if(new_cell->next) new_cell->next->prev = new_cell; new_cell->prev = cell; new_cell->split_level = refinement_stack.size()+1; /* Update old, splitted cell parameters */ cell->length = first_half_size; cell->next = new_cell; /* CR */ 
if(cr_enabled) cr_create_at_level_trailed(new_cell->first, cr_get_level(cell->first)); /* Add cell in refinement_stack for backtracking */ i.split_cell_first = new_cell->first; if(cell->prev_nonsingleton) i.prev_nonsingleton_first = cell->prev_nonsingleton->first; else i.prev_nonsingleton_first = -1; if(cell->next_nonsingleton) i.next_nonsingleton_first = cell->next_nonsingleton->first; else i.next_nonsingleton_first = -1; refinement_stack.push(i); /* Modify nonsingleton cell list */ if(new_cell->length > 1) { new_cell->prev_nonsingleton = cell; new_cell->next_nonsingleton = cell->next_nonsingleton; if(new_cell->next_nonsingleton) new_cell->next_nonsingleton->prev_nonsingleton = new_cell; cell->next_nonsingleton = new_cell; } else { new_cell->next_nonsingleton = 0; new_cell->prev_nonsingleton = 0; discrete_cell_count++; } if(cell->is_unit()) { if(cell->prev_nonsingleton) cell->prev_nonsingleton->next_nonsingleton = cell->next_nonsingleton; else first_nonsingleton_cell = cell->next_nonsingleton; if(cell->next_nonsingleton) cell->next_nonsingleton->prev_nonsingleton = cell->prev_nonsingleton; cell->next_nonsingleton = 0; cell->prev_nonsingleton = 0; discrete_cell_count++; } return new_cell; } private: unsigned int N; Cell* cells; Cell* free_cells; unsigned int discrete_cell_count; public: Cell* first_cell; Cell* first_nonsingleton_cell; unsigned int *elements; /* invariant_values[e] gives the invariant value of the element e */ unsigned int *invariant_values; /* element_to_cell_map[e] gives the cell of the element e */ Cell **element_to_cell_map; /** Get the cell of the element \a e */ Cell* get_cell(const unsigned int e) const { return element_to_cell_map[e]; } /* in_pos[e] points to the elements array s.t. 
*in_pos[e] = e */ unsigned int **in_pos; //Partition(); //~Partition(); Partition() { N = 0; elements = 0; in_pos = 0; invariant_values = 0; cells = 0; free_cells = 0; element_to_cell_map = 0; graph = 0; discrete_cell_count = 0; /* Initialize a distribution count sorting array. */ for(unsigned int i = 0; i < 256; i++) dcs_count[i] = 0; cr_enabled = false; cr_cells = 0; cr_levels = 0; } ~Partition() { if(elements) {free(elements); elements = 0; } if(cells) {free(cells); cells = 0; } if(element_to_cell_map) {free(element_to_cell_map); element_to_cell_map = 0; } if(in_pos) {free(in_pos); in_pos = 0; } if(invariant_values) {free(invariant_values); invariant_values = 0; } N = 0; } /** * Initialize the partition to the unit partition (all elements in one cell) * over the \a N > 0 elements {0,...,\a N-1}. */ //void init(const unsigned int N); void init(const unsigned int M) { assert(M > 0); N = M; if(elements) free(elements); elements = (unsigned int*)malloc(N * sizeof(unsigned int)); for(unsigned int i = 0; i < N; i++) elements[i] = i; if(in_pos) free(in_pos); in_pos = (unsigned int**)malloc(N * sizeof(unsigned int*)); for(unsigned int i = 0; i < N; i++) in_pos[i] = elements + i; if(invariant_values) free(invariant_values); invariant_values = (unsigned int*)malloc(N * sizeof(unsigned int)); for(unsigned int i = 0; i < N; i++) invariant_values[i] = 0; if(cells) free(cells); cells = (Cell*)malloc(N * sizeof(Cell)); cells[0].first = 0; cells[0].length = N; cells[0].max_ival = 0; cells[0].max_ival_count = 0; cells[0].in_splitting_queue = false; cells[0].in_neighbour_heap = false; cells[0].prev = 0; cells[0].next = 0; cells[0].next_nonsingleton = 0; cells[0].prev_nonsingleton = 0; cells[0].split_level = 0; first_cell = &cells[0]; if(N == 1) { first_nonsingleton_cell = 0; discrete_cell_count = 1; } else { first_nonsingleton_cell = &cells[0]; discrete_cell_count = 0; } for(unsigned int i = 1; i < N; i++) { cells[i].first = 0; cells[i].length = 0; cells[i].max_ival = 0; 
cells[i].max_ival_count = 0; cells[i].in_splitting_queue = false; cells[i].in_neighbour_heap = false; cells[i].prev = 0; cells[i].next = (i < N-1)?&cells[i+1]:0; cells[i].next_nonsingleton = 0; cells[i].prev_nonsingleton = 0; } if(N > 1) free_cells = &cells[1]; else free_cells = 0; if(element_to_cell_map) free(element_to_cell_map); element_to_cell_map = (Cell **)malloc(N * sizeof(Cell *)); for(unsigned int i = 0; i < N; i++) element_to_cell_map[i] = first_cell; splitting_queue.init(N); refinement_stack.init(N); /* Reset the main backtracking stack */ bt_stack.clear(); } /** * Returns true iff the partition is discrete, meaning that all * the elements are in their own cells. */ bool is_discrete() const {return(free_cells == 0); } unsigned int nof_discrete_cells() const {return(discrete_cell_count); } /** * Print the partition into the file stream \a fp. */ //size_t print(FILE* const fp, const bool add_newline = true) const; size_t print(FILE* const fp, const bool add_newline = true) const { size_t r = 0; const char* cell_sep = ""; r += fprintf(fp, "["); for(Cell* cell = first_cell; cell; cell = cell->next) { /* Print cell */ r += fprintf(fp, "%s{", cell_sep); cell_sep = ","; const char* elem_sep = ""; for(unsigned int i = 0; i < cell->length; i++) { r += fprintf(fp, "%s%u", elem_sep, elements[cell->first + i]); elem_sep = ","; } r += fprintf(fp, "}"); } r += fprintf(fp, "]"); if(add_newline) r += fprintf(fp, "\n"); return r; } /** * Print the partition cell sizes into the file stream \a fp. 
*/ //size_t print_signature(FILE* const fp, const bool add_newline = true) const; size_t print_signature(FILE* const fp, const bool add_newline = true) const { size_t r = 0; const char* cell_sep = ""; r += fprintf(fp, "["); for(Cell* cell = first_cell; cell; cell = cell->next) { if(cell->is_unit()) continue; //fprintf(fp, "%s%u", cell_sep, cr_cells[cell->first].level); r += fprintf(fp, "%s%u", cell_sep, cell->length); cell_sep = ","; } r += fprintf(fp, "]"); if(add_newline) r += fprintf(fp, "\n"); return r; } /* * Splits the Cell \a cell into [cell_1,...,cell_n] * according to the invariant_values of the elements in \a cell. * After splitting, cell_1 == \a cell. * Returns the pointer to the Cell cell_n; * cell_n != cell iff the Cell \a cell was actually splitted. * The flag \a max_ival_info_ok indicates whether the max_ival and * max_ival_count fields of the Cell \a cell have consistent values * when the method is called. * Clears the invariant values of elements in the Cell \a cell as well as * the max_ival and max_ival_count fields of the Cell \a cell. 
*/ //Cell *zplit_cell(Cell * const cell, const bool max_ival_info_ok); Cell* zplit_cell(Cell* const cell, const bool max_ival_info_ok) { Cell* last_new_cell = cell; if(!max_ival_info_ok) { /* Compute max_ival info */ assert(cell->max_ival == 0); assert(cell->max_ival_count == 0); unsigned int *ep = elements + cell->first; for(unsigned int i = cell->length; i > 0; i--, ep++) { const unsigned int ival = invariant_values[*ep]; if(ival > cell->max_ival) { cell->max_ival = ival; cell->max_ival_count = 1; } else if(ival == cell->max_ival) { cell->max_ival_count++; } } } #ifdef BLISS_CONSISTENCY_CHECKS /* Verify max_ival info */ { unsigned int nof_zeros = 0; unsigned int max_ival = 0; unsigned int max_ival_count = 0; unsigned int *ep = elements + cell->first; for(unsigned int i = cell->length; i > 0; i--, ep++) { const unsigned int ival = invariant_values[*ep]; if(ival == 0) nof_zeros++; if(ival > max_ival) { max_ival = ival; max_ival_count = 1; } else if(ival == max_ival) max_ival_count++; } assert(max_ival == cell->max_ival); assert(max_ival_count == cell->max_ival_count); } #endif /* max_ival info has been computed */ if(cell->max_ival_count == cell->length) { /* All invariant values are the same, clear 'em */ if(cell->max_ival > 0) clear_ivs(cell); } else { /* All invariant values are not the same */ if(cell->max_ival == 1) { /* Specialized splitting for cells with binary invariant values */ last_new_cell = sort_and_split_cell1(cell); } else if(cell->max_ival < 256) { /* Specialized splitting for cells with invariant values < 256 */ last_new_cell = sort_and_split_cell255(cell, cell->max_ival); } else { /* Generic sorting and splitting */ const bool sorted = shellsort_cell(cell); if (!sorted) printf("error sorting\n"); assert(sorted); last_new_cell = split_cell(cell); } } cell->max_ival = 0; cell->max_ival_count = 0; return last_new_cell; } /* * Routines for component recursion */ //void cr_init(); //void cr_free(); unsigned int cr_get_level(const unsigned int 
cell_index) const; //unsigned int cr_split_level(const unsigned int level, const std::vector& cells); /* * * Component recursion specific code * */ void cr_init() { assert(bt_stack.empty()); cr_enabled = true; if(cr_cells) free(cr_cells); cr_cells = (CRCell*)malloc(N * sizeof(CRCell)); if(!cr_cells) {assert(false && "Mem out"); } if(cr_levels) free(cr_levels); cr_levels = (CRCell**)malloc(N * sizeof(CRCell*)); if(!cr_levels) {assert(false && "Mem out"); } for(unsigned int i = 0; i < N; i++) { cr_levels[i] = 0; cr_cells[i].level = UINT_MAX; cr_cells[i].next = 0; cr_cells[i].prev_next_ptr = 0; } for(const Cell *cell = first_cell; cell; cell = cell->next) cr_create_at_level_trailed(cell->first, 0); cr_max_level = 0; } void cr_free() { if(cr_cells) {free(cr_cells); cr_cells = 0; } if(cr_levels) {free(cr_levels); cr_levels = 0; } cr_created_trail.clear(); cr_splitted_level_trail.clear(); cr_bt_info.clear(); cr_max_level = 0; cr_enabled = false; } unsigned int cr_split_level(const unsigned int level, const std::vector& splitted_cells) { assert(cr_enabled); assert(level <= cr_max_level); cr_levels[++cr_max_level] = 0; cr_splitted_level_trail.push_back(level); for(unsigned int i = 0; i < splitted_cells.size(); i++) { const unsigned int cell_index = splitted_cells[i]; assert(cell_index < N); CRCell& cr_cell = cr_cells[cell_index]; assert(cr_cell.level == level); cr_cell.detach(); cr_create_at_level(cell_index, cr_max_level); } return cr_max_level; } /** Clear the invariant_values of the elements in the Cell \a cell. */ //void clear_ivs(Cell* const cell); void clear_ivs(Cell* const cell) { unsigned int* ep = elements + cell->first; for(unsigned int i = cell->length; i > 0; i--, ep++) invariant_values[*ep] = 0; } private: /* * Component recursion data structures */ /* Is component recursion support in use? 
*/ bool cr_enabled; class CRCell { public: unsigned int level; CRCell* next; CRCell** prev_next_ptr; void detach() { if(next) next->prev_next_ptr = prev_next_ptr; *(prev_next_ptr) = next; level = UINT_MAX; next = 0; prev_next_ptr = 0; } }; CRCell* cr_cells; CRCell** cr_levels; class CR_BTInfo { public: unsigned int created_trail_index; unsigned int splitted_level_trail_index; }; std::vector cr_created_trail; std::vector cr_splitted_level_trail; std::vector cr_bt_info; unsigned int cr_max_level; //void cr_create_at_level(const unsigned int cell_index, unsigned int level); //void cr_create_at_level_trailed(const unsigned int cell_index, unsigned int level); //unsigned int cr_get_backtrack_point(); //void cr_goto_backtrack_point(const unsigned int btpoint); void cr_create_at_level(const unsigned int cell_index, const unsigned int level) { assert(cr_enabled); assert(cell_index < N); assert(level < N); CRCell& cr_cell = cr_cells[cell_index]; assert(cr_cell.level == UINT_MAX); assert(cr_cell.next == 0); assert(cr_cell.prev_next_ptr == 0); if(cr_levels[level]) cr_levels[level]->prev_next_ptr = &(cr_cell.next); cr_cell.next = cr_levels[level]; cr_levels[level] = &cr_cell; cr_cell.prev_next_ptr = &cr_levels[level]; cr_cell.level = level; } void cr_create_at_level_trailed(const unsigned int cell_index, const unsigned int level) { assert(cr_enabled); cr_create_at_level(cell_index, level); cr_created_trail.push_back(cell_index); } unsigned int cr_get_backtrack_point() { assert(cr_enabled); CR_BTInfo info; info.created_trail_index = cr_created_trail.size(); info.splitted_level_trail_index = cr_splitted_level_trail.size(); cr_bt_info.push_back(info); return cr_bt_info.size()-1; } void cr_goto_backtrack_point(const unsigned int btpoint) { assert(cr_enabled); assert(btpoint < cr_bt_info.size()); while(cr_created_trail.size() > cr_bt_info[btpoint].created_trail_index) { const unsigned int cell_index = cr_created_trail.back(); cr_created_trail.pop_back(); CRCell& cr_cell = 
cr_cells[cell_index]; assert(cr_cell.level != UINT_MAX); assert(cr_cell.prev_next_ptr); cr_cell.detach(); } while(cr_splitted_level_trail.size() > cr_bt_info[btpoint].splitted_level_trail_index) { const unsigned int dest_level = cr_splitted_level_trail.back(); cr_splitted_level_trail.pop_back(); assert(cr_max_level > 0); assert(dest_level < cr_max_level); while(cr_levels[cr_max_level]) { CRCell *cr_cell = cr_levels[cr_max_level]; cr_cell->detach(); cr_create_at_level(cr_cell - cr_cells, dest_level); } cr_max_level--; } cr_bt_info.resize(btpoint); } // Auxiliary routines for sorting and splitting cells //Cell* sort_and_split_cell1(Cell* cell); //Cell* sort_and_split_cell255(Cell* const cell, const unsigned int max_ival); //bool shellsort_cell(Cell* cell); // Assumes that the invariant values are NOT the same and that the cell contains more than one element Cell* sort_and_split_cell1(Cell* const cell) { #if defined(BLISS_EXPENSIVE_CONSISTENCY_CHECKS) assert(cell->length > 1); assert(cell->first + cell->length <= N); unsigned int nof_0_found = 0; unsigned int nof_1_found = 0; for(unsigned int i = cell->first; i < cell->first + cell->length; i++) { const unsigned int ival = invariant_values[elements[i]]; assert(ival == 0 or ival == 1); if(ival == 0) nof_0_found++; else nof_1_found++; } assert(nof_0_found > 0); assert(nof_1_found > 0); assert(nof_1_found == cell->max_ival_count); assert(nof_0_found + nof_1_found == cell->length); assert(cell->max_ival == 1); #endif /* (Pseudo)allocate new cell */ Cell* const new_cell = free_cells; free_cells = new_cell->next; #define NEW_SORT1 #ifdef NEW_SORT1 unsigned int *ep0 = elements + cell->first; unsigned int *ep1 = ep0 + cell->length - cell->max_ival_count; if(cell->max_ival_count > cell->length / 2) { /* There are more ones than zeros, only move zeros */ unsigned int * const end = ep0 + cell->length; while(ep1 < end) { while(invariant_values[*ep1] == 0) { const unsigned int tmp = *ep1; *ep1 = *ep0; *ep0 = tmp; in_pos[tmp] = 
ep0; in_pos[*ep1] = ep1; ep0++; } element_to_cell_map[*ep1] = new_cell; invariant_values[*ep1] = 0; ep1++; } } else { /* There are more zeros than ones, only move ones */ unsigned int * const end = ep1; while(ep0 < end) { while(invariant_values[*ep0] != 0) { const unsigned int tmp = *ep0; *ep0 = *ep1; *ep1 = tmp; in_pos[tmp] = ep1; in_pos[*ep0] = ep0; ep1++; } ep0++; } ep1 = end; while(ep1 < elements + cell->first + cell->length) { element_to_cell_map[*ep1] = new_cell; invariant_values[*ep1] = 0; ep1++; } } /* Update new cell parameters */ new_cell->first = cell->first + cell->length - cell->max_ival_count; new_cell->length = cell->length - (new_cell->first - cell->first); new_cell->next = cell->next; if(new_cell->next) new_cell->next->prev = new_cell; new_cell->prev = cell; new_cell->split_level = refinement_stack.size()+1; /* Update old, splitted cell parameters */ cell->length = new_cell->first - cell->first; cell->next = new_cell; /* CR */ if(cr_enabled) cr_create_at_level_trailed(new_cell->first, cr_get_level(cell->first)); #else /* Sort vertices in the cell according to the invariant values */ unsigned int *ep0 = elements + cell->first; unsigned int *ep1 = ep0 + cell->length; while(ep1 > ep0) { const unsigned int element = *ep0; const unsigned int ival = invariant_values[element]; invariant_values[element] = 0; if(ival == 0) { ep0++; } else { ep1--; *ep0 = *ep1; *ep1 = element; element_to_cell_map[element] = new_cell; in_pos[element] = ep1; in_pos[*ep0] = ep0; } } /* Update new cell parameters */ new_cell->first = ep1 - elements; new_cell->length = cell->length - (new_cell->first - cell->first); new_cell->next = cell->next; if(new_cell->next) new_cell->next->prev = new_cell; new_cell->prev = cell; new_cell->split_level = cell->split_level; /* Update old, splitted cell parameters */ cell->length = new_cell->first - cell->first; cell->next = new_cell; cell->split_level = refinement_stack.size()+1; /* CR */ if(cr_enabled) 
cr_create_at_level_trailed(new_cell->first, cr_get_level(cell->first)); #endif /* ifdef NEW_SORT1*/ /* Add cell in refinement stack for backtracking */ { RefInfo i; i.split_cell_first = new_cell->first; if(cell->prev_nonsingleton) i.prev_nonsingleton_first = cell->prev_nonsingleton->first; else i.prev_nonsingleton_first = -1; if(cell->next_nonsingleton) i.next_nonsingleton_first = cell->next_nonsingleton->first; else i.next_nonsingleton_first = -1; /* Modify nonsingleton cell list */ if(new_cell->length > 1) { new_cell->prev_nonsingleton = cell; new_cell->next_nonsingleton = cell->next_nonsingleton; if(new_cell->next_nonsingleton) new_cell->next_nonsingleton->prev_nonsingleton = new_cell; cell->next_nonsingleton = new_cell; } else { new_cell->next_nonsingleton = 0; new_cell->prev_nonsingleton = 0; discrete_cell_count++; } if(cell->is_unit()) { if(cell->prev_nonsingleton) cell->prev_nonsingleton->next_nonsingleton = cell->next_nonsingleton; else first_nonsingleton_cell = cell->next_nonsingleton; if(cell->next_nonsingleton) cell->next_nonsingleton->prev_nonsingleton = cell->prev_nonsingleton; cell->next_nonsingleton = 0; cell->prev_nonsingleton = 0; discrete_cell_count++; } refinement_stack.push(i); } /* Add cells in splitting queue */ if(cell->in_splitting_queue) { /* Both cells must be included in splitting_queue in order to have refinement to equitable partition */ splitting_queue_add(new_cell); } else { Cell *min_cell, *max_cell; if(cell->length <= new_cell->length) { min_cell = cell; max_cell = new_cell; } else { min_cell = new_cell; max_cell = cell; } /* Put the smaller cell in splitting_queue */ splitting_queue_add(min_cell); if(max_cell->is_unit()) { /* Put the "larger" cell also in splitting_queue */ splitting_queue_add(max_cell); } } return new_cell; } // Sort the elements in a cell according to their invariant values. // The invariant values are not cleared. // Warning: the in_pos array is left in incorrect state. 
// Shell sort the elements of a non-unit cell into increasing order of
// their invariant values. Returns false (without sorting) when the cell
// is a unit cell or when all its invariant values are already equal.
bool shellsort_cell(Cell* const cell) {
  unsigned int h;
  unsigned int* ep;

  if(cell->is_unit())
    return false;

  /* Check whether all the elements have the same invariant value */
  bool equal_invariant_values = true;
  {
    ep = elements + cell->first;
    const unsigned int ival = invariant_values[*ep];
    ep++;
    for(unsigned int i = cell->length - 1; i > 0; i--) {
      if(invariant_values[*ep] != ival) {
        equal_invariant_values = false;
        break;
      }
      ep++;
    }
  }
  if(equal_invariant_values)
    return false;

  /* Shell sort with the h -> 3h+1 gap sequence */
  ep = elements + cell->first;
  for(h = 1; h <= cell->length/9; h = 3*h + 1)
    ;
  for( ; h > 0; h = h/3) {
    for(unsigned int i = h; i < cell->length; i++) {
      const unsigned int element = ep[i];
      const unsigned int ival = invariant_values[element];
      unsigned int j = i;
      while(j >= h and invariant_values[ep[j-h]] > ival) {
        ep[j] = ep[j-h];
        j -= h;
      }
      ep[j] = element;
    }
  }
  return true;
}

// Distribution count sorting of cells with invariant values less than 256.
// Counts values into dcs_count, builds bucket offsets via
// dcs_cumulate_count(), permutes the cell's elements in place, then
// delegates to split_cell(). dcs_count is returned to all-zeros on exit.
Cell* sort_and_split_cell255(Cell* const cell, const unsigned int max_ival) {
  if(cell->is_unit()) {
    /* Reset invariant value */
    invariant_values[elements[cell->first]] = 0;
    return cell;
  }
#ifdef BLISS_CONSISTENCY_CHECKS
  for(unsigned int i = 0; i < 256; i++)
    assert(dcs_count[i] == 0);
#endif
  /*
   * Compute the distribution of invariant values to the count array
   */
  {
    const unsigned int *ep = elements + cell->first;
    const unsigned int ival = invariant_values[*ep];
    dcs_count[ival]++;
    ep++;
#if defined(BLISS_CONSISTENCY_CHECKS)
    bool equal_invariant_values = true;
#endif
    for(unsigned int i = cell->length - 1; i != 0; i--) {
      const unsigned int ival2 = invariant_values[*ep];
      dcs_count[ival2]++;
#if defined(BLISS_CONSISTENCY_CHECKS)
      if(ival2 != ival) {
        equal_invariant_values = false;
      }
#endif
      ep++;
    }
#if defined(BLISS_CONSISTENCY_CHECKS)
    assert(!equal_invariant_values);
    if(equal_invariant_values) {
      assert(dcs_count[ival] == cell->length);
      dcs_count[ival] = 0;
      clear_ivs(cell);
      return cell;
    }
#endif
  }
  /* Build start array */
  dcs_cumulate_count(max_ival);
  /* Do the sorting */
  /* In-place permutation: repeatedly swap the element at hand into its
   * target bucket until the element belonging to the current bucket i
   * arrives, advancing dcs_start/dcs_count as elements are placed. */
  for(unsigned int i = 0; i <= max_ival; i++) {
    unsigned int *ep = elements + cell->first + dcs_start[i];
    for(unsigned int j = dcs_count[i]; j > 0; j--) {
      while(true) {
        const unsigned int element = *ep;
        const unsigned int ival = invariant_values[element];
        if(ival == i)
          break;
        *ep = elements[cell->first + dcs_start[ival]];
        elements[cell->first + dcs_start[ival]] = element;
        dcs_start[ival]++;
        dcs_count[ival]--;
      }
      ep++;
    }
    dcs_count[i] = 0; /* leave counts zeroed for the next call */
  }
#if defined(BLISS_CONSISTENCY_CHECKS)
  for(unsigned int i = 0; i < 256; i++)
    assert(dcs_count[i] == 0);
#endif
  /* split cell */
  Cell* const new_cell = split_cell(cell);
  return new_cell;
}

Cell* split_cell(Cell* const cell);

/*
 * Some auxiliary stuff needed for distribution count sorting.
 * To make the code thread-safe (modulo the requirement that each graph is
 * only accessed in one thread at a time), the arrays are owned by
 * the partition instance, not statically defined.
 */
unsigned int dcs_count[256];
unsigned int dcs_start[256];
//void dcs_cumulate_count(const unsigned int max);

// An auxiliary function for distribution count sorting.
// Build start array so that
// dcs_start[0] = 0 and dcs_start[i+1] = dcs_start[i] + dcs_count[i].
// Exclusive prefix sum over dcs_count[0..max] into dcs_start[0..max]:
// dcs_start[0] = 0 and dcs_start[i+1] = dcs_start[i] + dcs_count[i].
void dcs_cumulate_count(const unsigned int max) {
  unsigned int* count_p = dcs_count;
  unsigned int* start_p = dcs_start;
  unsigned int sum = 0;
  for(unsigned int i = max+1; i > 0; i--) {
    *start_p = sum;
    start_p++;
    sum += *count_p;
    count_p++;
  }
}
};

// Pop the next cell from the splitting queue and clear its queued flag.
inline Partition::Cell* Partition::splitting_queue_pop() {
  Cell* const cell = splitting_queue.pop_front();
  cell->in_splitting_queue = false;
  return cell;
}

inline bool Partition::splitting_queue_is_empty() const {
  return splitting_queue.is_empty();
}

// Component-recursion level of the cell whose first element index is
// cell_index (UINT_MAX when not on any level).
inline unsigned int Partition::cr_get_level(const unsigned int cell_index) const {
  return(cr_cells[cell_index].level);
}

} // namespace bliss
#endif


================================================ FILE: external/bliss/bliss/search.h ================================================
// NOTE(review): search() continues beyond this chunk; only its prologue
// (statistics reset and freeing/reallocation of path data) is visible here.
void search(const bool canonical, Stats& stats) {
  const unsigned int N = get_nof_vertices();
  unsigned int all_same_level = UINT_MAX;
  p.graph = this;
  /*
   * Must be done!
   */
  remove_duplicate_edges();
  /*
   * Reset search statistics
   */
  stats.reset();
  stats.nof_nodes = 1;
  stats.nof_leaf_nodes = 1;
  /* Free old first path data structures */
  if (first_path_labeling) {
    free(first_path_labeling);
    first_path_labeling = 0;
  }
  if (first_path_labeling_inv) {
    free(first_path_labeling_inv);
    first_path_labeling_inv = 0;
  }
  if (first_path_automorphism) {
    free(first_path_automorphism);
    first_path_automorphism = 0;
  }
  /* Free old best path data structures */
  if (best_path_labeling) {
    free(best_path_labeling);
    best_path_labeling = 0;
  }
  if (best_path_labeling_inv) {
    free(best_path_labeling_inv);
    best_path_labeling_inv = 0;
  }
  if (best_path_automorphism) {
    free(best_path_automorphism);
    best_path_automorphism = 0;
  }
  if (N == 0) {
    /* Nothing to do, return... */
    return;
  }
  /* Initialize the partition ... */
  p.init(N);
  /* ...
and the component recursion data structures in the partition */ if (opt_use_comprec) p.cr_init(); neighbour_heap.init(N); in_search = false; /* Do not compute certificate when building the initial partition */ refine_compare_certificate = false; /* The 'eqref_hash' hash value is not computed when building * the initial partition as it is not used for anything at the moment. * This saves some cycles. */ compute_eqref_hash = false; // Timer timer1; make_initial_equitable_partition(); if (verbstr and verbose_level >= 2) { fprintf(verbstr, "Initial partition computed in %.2f seconds\n", 0.0); // timer1.get_duration()); fflush(verbstr); } /* * Allocate space for the "first path" and "best path" labelings */ if (first_path_labeling) free(first_path_labeling); first_path_labeling = (unsigned int*)calloc(N, sizeof(unsigned int)); if (!first_path_labeling) _OUT_OF_MEMORY(); if (best_path_labeling) free(best_path_labeling); best_path_labeling = (unsigned int*)calloc(N, sizeof(unsigned int)); if (!best_path_labeling) _OUT_OF_MEMORY(); /* * Is the initial partition discrete? */ if (p.is_discrete()) { /* Make the best path labeling i.e. 
the canonical labeling */ update_labeling(best_path_labeling); /* Update statistics */ stats.nof_leaf_nodes = 1; return; } /* * Allocate the inverses of the "first path" and "best path" labelings */ if (first_path_labeling_inv) free(first_path_labeling_inv); first_path_labeling_inv = (unsigned int*)calloc(N, sizeof(unsigned int)); if (!first_path_labeling_inv) _OUT_OF_MEMORY(); if (best_path_labeling_inv) free(best_path_labeling_inv); best_path_labeling_inv = (unsigned int*)calloc(N, sizeof(unsigned int)); if (!best_path_labeling_inv) _OUT_OF_MEMORY(); /* * Allocate space for the automorphisms */ if (first_path_automorphism) free(first_path_automorphism); first_path_automorphism = (unsigned int*)malloc(N * sizeof(unsigned int)); if (!first_path_automorphism) _OUT_OF_MEMORY(); if (best_path_automorphism) free(best_path_automorphism); best_path_automorphism = (unsigned int*)malloc(N * sizeof(unsigned int)); if (!best_path_automorphism) _OUT_OF_MEMORY(); /* * Initialize orbit information so that all vertices are in their own orbits */ first_path_orbits.init(N); best_path_orbits.init(N); /* * Initialize certificate memory */ initialize_certificate(); std::vector search_stack; std::vector first_path_info; std::vector best_path_info; search_stack.clear(); /* Initialize "long prune" data structures */ if (opt_use_long_prune) long_prune_init(); /* * Initialize failure recording data structures */ typedef std::set> FailureRecordingSet; std::vector failure_recording_hashes; /* * Initialize component recursion data structures */ cr_cep_stack.clear(); unsigned int cr_cep_index = 0; { /* Inset a sentinel "component end point" */ CR_CEP sentinel; sentinel.creation_level = 0; sentinel.discrete_cell_limit = get_nof_vertices(); sentinel.next_cr_level = 0; sentinel.next_cep_index = 0; sentinel.first_checked = false; sentinel.best_checked = false; cr_cep_index = 0; cr_cep_stack.push_back(sentinel); } cr_level = 0; if (opt_use_comprec and nucr_find_first_component(cr_level) == true 
and p.nof_discrete_cells() + cr_component_elements < cr_cep_stack[cr_cep_index].discrete_cell_limit) { cr_level = p.cr_split_level(0, cr_component); CR_CEP cep; cep.creation_level = 0; cep.discrete_cell_limit = p.nof_discrete_cells() + cr_component_elements; cep.next_cr_level = 0; cep.next_cep_index = cr_cep_index; cep.first_checked = false; cep.best_checked = false; cr_cep_index = cr_cep_stack.size(); cr_cep_stack.push_back(cep); } /* * Build the root node of the search tree */ { TreeNode root; Partition::Cell* split_cell = find_next_cell_to_be_splitted(p.first_cell); root.split_cell_first = split_cell->first; root.split_element = TreeNode::SPLIT_START; root.partition_bt_point = p.set_backtrack_point(); root.certificate_index = 0; root.fp_on = true; root.fp_cert_equal = true; root.fp_extendable = TreeNode::MAYBE; root.in_best_path = false; root.cmp_to_best_path = 0; root.long_prune_begin = 0; root.failure_recording_ival = 0; /* Save component recursion info for backtracking */ root.cr_level = cr_level; root.cr_cep_stack_size = cr_cep_stack.size(); root.cr_cep_index = cr_cep_index; search_stack.push_back(root); } /* * Set status and global flags for search related procedures */ in_search = true; /* Do not compare certificates during refinement until the first path has been * traversed to the leaf */ refine_compare_certificate = false; /* * The actual backtracking search */ while (!search_stack.empty()) { TreeNode& current_node = search_stack.back(); const unsigned int current_level = (unsigned int)search_stack.size() - 1; if (opt_use_comprec) { CR_CEP& cep = cr_cep_stack[current_node.cr_cep_index]; if (cep.first_checked == true and current_node.fp_extendable == TreeNode::MAYBE and !search_stack[cep.creation_level].fp_on) { current_node.fp_extendable = TreeNode::NO; } } if (current_node.fp_on) { if (current_node.split_element == TreeNode::SPLIT_END) { search_stack.pop_back(); continue; } } else { if (current_node.fp_extendable == TreeNode::YES) { 
search_stack.pop_back(); continue; } if (current_node.split_element == TreeNode::SPLIT_END) { if (opt_use_failure_recording) { TreeNode& parent_node = search_stack[current_level - 1]; if (parent_node.fp_on) failure_recording_hashes[current_level - 1].insert( current_node.failure_recording_ival); } search_stack.pop_back(); continue; } if (current_node.fp_extendable == TreeNode::NO and (!canonical or current_node.cmp_to_best_path < 0)) { if (opt_use_failure_recording) { TreeNode& parent_node = search_stack[current_level - 1]; if (parent_node.fp_on) failure_recording_hashes[current_level - 1].insert( current_node.failure_recording_ival); } search_stack.pop_back(); continue; } } /* Restore partition ... */ p.goto_backtrack_point(current_node.partition_bt_point); /* ... and re-remember backtracking point */ current_node.partition_bt_point = p.set_backtrack_point(); /* Restore current path certificate */ certificate_index = current_node.certificate_index; refine_current_path_certificate_index = current_node.certificate_index; certificate_current_path.resize(certificate_index); /* Fetch split cell information */ Partition::Cell* const cell = p.get_cell(p.elements[current_node.split_cell_first]); /* Restore component recursion information */ cr_level = current_node.cr_level; cr_cep_stack.resize(current_node.cr_cep_stack_size); cr_cep_index = current_node.cr_cep_index; /* * Update long prune redundancy sets */ if (opt_use_long_prune and current_level >= 1 and !current_node.fp_on) { unsigned int begin = (current_node.long_prune_begin > long_prune_begin) ? 
current_node.long_prune_begin : long_prune_begin; for (unsigned int i = begin; i < long_prune_end; i++) { const std::vector& fixed = long_prune_get_fixed(i); #if defined(BLISS_CONSISTENCY_CHECKS) for (unsigned int l = 0; l < search_stack.size() - 2; l++) assert(fixed[search_stack[l].split_element]); #endif if (fixed[search_stack[search_stack.size() - 1 - 1].split_element] == false) { long_prune_swap(begin, i); begin++; current_node.long_prune_begin = begin; continue; } } if (current_node.split_element == TreeNode::SPLIT_START) { current_node.needs_long_prune = true; } else if (current_node.needs_long_prune) { current_node.needs_long_prune = false; unsigned int begin = (current_node.long_prune_begin > long_prune_begin) ? current_node.long_prune_begin : long_prune_begin; for (unsigned int i = begin; i < long_prune_end; i++) { const std::vector& fixed = long_prune_get_fixed(i); #if defined(BLISS_CONSISTENCY_CHECKS) for (unsigned int l = 0; l < search_stack.size() - 2; l++) assert(fixed[search_stack[l].split_element]); #endif assert(fixed[search_stack[current_level - 1].split_element] == true); if (fixed[search_stack[current_level - 1].split_element] == false) { long_prune_swap(begin, i); begin++; current_node.long_prune_begin = begin; continue; } const std::vector& mcrs = long_prune_get_mcrs(i); unsigned int* ep = p.elements + cell->first; for (unsigned int j = cell->length; j > 0; j--, ep++) { if (mcrs[*ep] == false) current_node.long_prune_redundant.insert(*ep); } } } } /* * Find the next smallest, non-isomorphic element in the cell and * store it in current_node.split_element */ { unsigned int next_split_element = UINT_MAX; // unsigned int* next_split_element_pos = 0; unsigned int* ep = p.elements + cell->first; if (current_node.fp_on) { /* Find the next larger splitting element that is * a minimal orbit representative w.r.t. 
first_path_orbits */ for (unsigned int i = cell->length; i > 0; i--, ep++) { if ((int)(*ep) > current_node.split_element and *ep < next_split_element and first_path_orbits.is_minimal_representative(*ep)) { next_split_element = *ep; // next_split_element_pos = ep; } } } else if (current_node.in_best_path) { /* Find the next larger splitting element that is * a minimal orbit representative w.r.t. best_path_orbits */ for (unsigned int i = cell->length; i > 0; i--, ep++) { if ((int)(*ep) > current_node.split_element and *ep < next_split_element and best_path_orbits.is_minimal_representative(*ep) and (!opt_use_long_prune or current_node.long_prune_redundant.find(*ep) == current_node.long_prune_redundant.end())) { next_split_element = *ep; // next_split_element_pos = ep; } } } else { /* Find the next larger splitting element */ for (unsigned int i = cell->length; i > 0; i--, ep++) { if ((int)(*ep) > current_node.split_element and *ep < next_split_element and (!opt_use_long_prune or current_node.long_prune_redundant.find(*ep) == current_node.long_prune_redundant.end())) { next_split_element = *ep; // next_split_element_pos = ep; } } } if (next_split_element == UINT_MAX) { /* No more (unexplored children) in the cell */ current_node.split_element = TreeNode::SPLIT_END; if (current_node.fp_on) { /* Update group size */ const unsigned int index = first_path_orbits.orbit_size( first_path_info[search_stack.size() - 1].splitting_element); stats.group_size.multiply(index); stats.group_size_approx *= (long double)index; /* * Update all_same_level */ if (index == cell->length and all_same_level == current_level + 1) all_same_level = current_level; if (verbstr and verbose_level >= 2) { fprintf(verbstr, "Level %u: orbits=%u, index=%u/%u, all_same_level=%u\n", current_level, first_path_orbits.nof_orbits(), index, cell->length, all_same_level); fflush(verbstr); } } continue; } /* Split on smallest */ current_node.split_element = next_split_element; } const unsigned int child_level = 
current_level + 1; /* Update some statistics */ stats.nof_nodes++; if (search_stack.size() > stats.max_level) stats.max_level = search_stack.size(); /* Set flags and indices for the refiner certificate builder */ refine_equal_to_first = current_node.fp_cert_equal; refine_cmp_to_best = current_node.cmp_to_best_path; if (!first_path_info.empty()) { if (refine_equal_to_first) refine_first_path_subcertificate_end = first_path_info[search_stack.size() - 1].certificate_index + first_path_info[search_stack.size() - 1].subcertificate_length; if (canonical) { if (refine_cmp_to_best == 0) refine_best_path_subcertificate_end = best_path_info[search_stack.size() - 1].certificate_index + best_path_info[search_stack.size() - 1].subcertificate_length; } else refine_cmp_to_best = -1; } const bool was_fp_cert_equal = current_node.fp_cert_equal; /* Individualize, i.e. split the cell in two, the latter new cell * will be a unit one containing info.split_element */ Partition::Cell* const new_cell = p.individualize(cell, current_node.split_element); /* * Refine the new partition to equitable */ if (cell->is_unit()) refine_to_equitable(cell, new_cell); else refine_to_equitable(new_cell); /* Update statistics */ if (p.is_discrete()) stats.nof_leaf_nodes++; if (!first_path_info.empty()) { /* We are no longer on the first path */ const unsigned int subcertificate_length = certificate_current_path.size() - certificate_index; if (refine_equal_to_first) { /* Was equal to the first path so far */ PathInfo& first_pinfo = first_path_info[current_level]; assert(first_pinfo.certificate_index == certificate_index); if (subcertificate_length != first_pinfo.subcertificate_length) { refine_equal_to_first = false; if (opt_use_failure_recording) failure_recording_fp_deviation = subcertificate_length; } else if (first_pinfo.eqref_hash.cmp(eqref_hash) != 0) { refine_equal_to_first = false; if (opt_use_failure_recording) failure_recording_fp_deviation = eqref_hash.get_value(); } } if (canonical and 
(refine_cmp_to_best == 0)) { /* Was equal to the best path so far */ PathInfo& bestp_info = best_path_info[current_level]; assert(bestp_info.certificate_index == certificate_index); if (subcertificate_length < bestp_info.subcertificate_length) { refine_cmp_to_best = -1; } else if (subcertificate_length > bestp_info.subcertificate_length) { refine_cmp_to_best = 1; } else if (bestp_info.eqref_hash.cmp(eqref_hash) > 0) { refine_cmp_to_best = -1; } else if (bestp_info.eqref_hash.cmp(eqref_hash) < 0) { refine_cmp_to_best = 1; } } if (opt_use_failure_recording and was_fp_cert_equal and !refine_equal_to_first) { UintSeqHash k; k.update(failure_recording_fp_deviation); k.update(eqref_hash.get_value()); failure_recording_fp_deviation = k.get_value(); if (current_node.fp_on) failure_recording_hashes[current_level].insert( failure_recording_fp_deviation); else { for (unsigned int i = current_level; i > 0; i--) { if (search_stack[i].fp_on) break; const FailureRecordingSet& s = failure_recording_hashes[i]; if (i == current_level and s.find(failure_recording_fp_deviation) != s.end()) break; if (s.find(0) != s.end()) break; search_stack[i].fp_extendable = TreeNode::NO; } } } /* Check if no longer equal to the first path and, * if canonical labeling is desired, also worse than the * current best path */ if (refine_equal_to_first == false and (!canonical or (refine_cmp_to_best < 0))) { /* Yes, backtrack */ stats.nof_bad_nodes++; if (current_node.fp_cert_equal == true and current_level + 1 > all_same_level) { assert(all_same_level >= 1); for (unsigned int i = all_same_level; i < search_stack.size(); i++) { search_stack[i].fp_extendable = TreeNode::NO; } } continue; } } #if defined(BLISS_VERIFY_EQUITABLEDNESS) /* The new partition should be equitable */ if (!is_equitable()) fatal_error("consistency check failed - partition after refinement is " "not equitable"); #endif /* * Next level search tree node info */ TreeNode child_node; /* No more in the first path */ child_node.fp_on = 
false; /* No more in the best path */ child_node.in_best_path = false; child_node.fp_cert_equal = refine_equal_to_first; if (current_node.fp_extendable == TreeNode::NO or (current_node.fp_extendable == TreeNode::MAYBE and child_node.fp_cert_equal == false)) child_node.fp_extendable = TreeNode::NO; else child_node.fp_extendable = TreeNode::MAYBE; child_node.cmp_to_best_path = refine_cmp_to_best; child_node.failure_recording_ival = 0; child_node.cr_cep_stack_size = current_node.cr_cep_stack_size; child_node.cr_cep_index = current_node.cr_cep_index; child_node.cr_level = current_node.cr_level; certificate_index = certificate_current_path.size(); current_node.eqref_hash = eqref_hash; current_node.subcertificate_length = certificate_index - current_node.certificate_index; /* * The first encountered leaf node at the end of the "first path"? */ if (p.is_discrete() and first_path_info.empty()) { // fprintf(stdout, "Level %u: FIRST\n", child_level); fflush(stdout); stats.nof_canupdates++; /* * Update labelings and their inverses */ update_labeling_and_its_inverse(first_path_labeling, first_path_labeling_inv); update_labeling_and_its_inverse(best_path_labeling, best_path_labeling_inv); /* * Reset automorphism array */ reset_permutation(first_path_automorphism); reset_permutation(best_path_automorphism); /* * Reset orbit information */ first_path_orbits.reset(); best_path_orbits.reset(); /* * Reset group size */ stats.group_size.assign(1); stats.group_size_approx = 1.0; /* * Reset all_same_level */ all_same_level = child_level; /* * Mark the current path to be the first and best one and save it */ const unsigned int base_size = search_stack.size(); best_path_info.clear(); // fprintf(stdout, " New base is: "); for (unsigned int i = 0; i < base_size; i++) { search_stack[i].fp_on = true; search_stack[i].fp_cert_equal = true; search_stack[i].fp_extendable = TreeNode::YES; search_stack[i].in_best_path = true; search_stack[i].cmp_to_best_path = 0; PathInfo path_info; 
path_info.splitting_element = search_stack[i].split_element; path_info.certificate_index = search_stack[i].certificate_index; path_info.eqref_hash = search_stack[i].eqref_hash; path_info.subcertificate_length = search_stack[i].subcertificate_length; first_path_info.push_back(path_info); best_path_info.push_back(path_info); // fprintf(stdout, "%u ", search_stack[i].split_element); } // fprintf(stdout, "\n"); fflush(stdout); /* Copy certificates */ certificate_first_path = certificate_current_path; certificate_best_path = certificate_current_path; /* From now on, compare certificates when refining */ refine_compare_certificate = true; if (opt_use_failure_recording) failure_recording_hashes.resize(base_size); /* for(unsigned int j = 0; j < search_stack.size(); j++) fprintf(stderr, "%u ", search_stack[j].split_element); fprintf(stderr, "\n"); p.print(stderr); fprintf(stderr, "\n"); */ /* * Backtrack to the previous level */ continue; } if (p.is_discrete() and child_node.fp_cert_equal) { /* * A leaf node that is equal to the first one. 
* An automorphism found: aut[i] = elements[first_path_labeling[i]] */ goto handle_first_path_automorphism; } if (!p.is_discrete()) { Partition::Cell* next_split_cell = 0; /* * An internal, non-leaf node */ if (opt_use_comprec) { assert(p.nof_discrete_cells() <= cr_cep_stack[cr_cep_index].discrete_cell_limit); assert(cr_level == child_node.cr_level); if (p.nof_discrete_cells() == cr_cep_stack[cr_cep_index].discrete_cell_limit) { /* We have reached the end of a component */ assert(cr_cep_index != 0); CR_CEP& cep = cr_cep_stack[cr_cep_index]; /* First, compare with respect to the first path */ if (first_path_info.empty() or child_node.fp_cert_equal) { if (cep.first_checked == false) { /* First time, go to the next component */ cep.first_checked = true; } else { assert(!first_path_info.empty()); assert(cep.creation_level < search_stack.size()); TreeNode& old_info = search_stack[cep.creation_level]; /* If the component was found when on the first path, * handle the found automorphism as the other * first path automorphisms */ if (old_info.fp_on) goto handle_first_path_automorphism; } } if (canonical and !first_path_info.empty() and child_node.cmp_to_best_path >= 0) { if (cep.best_checked == false) { /* First time, go to the next component */ cep.best_checked = true; } else { assert(cep.creation_level < search_stack.size()); TreeNode& old_info = search_stack[cep.creation_level]; if (child_node.cmp_to_best_path == 0) { /* If the component was found when on the best path, * handle the found automorphism as the other * best path automorphisms */ if (old_info.in_best_path) goto handle_best_path_automorphism; /* Otherwise, we do not remember the automorhism as * we didn't memorize the path that was invariant * equal to the best one and passed through the * component. 
* Thus we can only backtrack to the previous level */ child_node.cmp_to_best_path = -1; if (!child_node.fp_cert_equal) { continue; } } else { assert(child_node.cmp_to_best_path > 0); if (old_info.in_best_path) { stats.nof_canupdates++; /* * Update canonical labeling and its inverse */ for (unsigned int i = 0; i < N; i++) { if (p.get_cell(p.elements[i])->is_unit()) { best_path_labeling[p.elements[i]] = i; best_path_labeling_inv[i] = p.elements[i]; } } // update_labeling_and_its_inverse(best_path_labeling, // best_path_labeling_inv); /* Reset best path automorphism */ reset_permutation(best_path_automorphism); /* Reset best path orbit structure */ best_path_orbits.reset(); /* Mark to be the best one and save prefix */ unsigned int postfix_start = cep.creation_level; assert(postfix_start < best_path_info.size()); while (p.get_cell( best_path_info[postfix_start].splitting_element) ->is_unit()) { postfix_start++; assert(postfix_start < best_path_info.size()); } unsigned int postfix_start_cert = best_path_info[postfix_start].certificate_index; std::vector best_path_temp = best_path_info; best_path_info.clear(); for (unsigned int i = 0; i < search_stack.size(); i++) { TreeNode& ss_info = search_stack[i]; PathInfo bp_info; ss_info.cmp_to_best_path = 0; ss_info.in_best_path = true; bp_info.splitting_element = ss_info.split_element; bp_info.certificate_index = ss_info.certificate_index; bp_info.subcertificate_length = ss_info.subcertificate_length; bp_info.eqref_hash = ss_info.eqref_hash; best_path_info.push_back(bp_info); } /* Copy the postfix of the previous best path */ for (unsigned int i = postfix_start; i < best_path_temp.size(); i++) { best_path_info.push_back(best_path_temp[i]); best_path_info[best_path_info.size() - 1] .certificate_index = best_path_info[best_path_info.size() - 2] .certificate_index + best_path_info[best_path_info.size() - 2] .subcertificate_length; } std::vector certificate_best_path_old = certificate_best_path; certificate_best_path = 
certificate_current_path; for (unsigned int i = postfix_start_cert; i < certificate_best_path_old.size(); i++) certificate_best_path.push_back( certificate_best_path_old[i]); assert(certificate_best_path.size() == best_path_info.back().certificate_index + best_path_info.back().subcertificate_length); /* Backtrack to the previous level */ continue; } } } } /* No backtracking performed, go to next componenet */ cr_level = cep.next_cr_level; cr_cep_index = cep.next_cep_index; } /* Check if the current component has been split into * new non-uniformity subcomponents */ // if(nucr_find_first_component(cr_level) == true and // p.nof_discrete_cells() + cr_component_elements < // cr_cep_stack[cr_cep_index].discrete_cell_limit) if (nucr_find_first_component(cr_level, cr_component, cr_component_elements, next_split_cell) == true and p.nof_discrete_cells() + cr_component_elements < cr_cep_stack[cr_cep_index].discrete_cell_limit) { const unsigned int next_cr_level = p.cr_split_level(cr_level, cr_component); CR_CEP cep; cep.creation_level = search_stack.size(); cep.discrete_cell_limit = p.nof_discrete_cells() + cr_component_elements; cep.next_cr_level = cr_level; cep.next_cep_index = cr_cep_index; cep.first_checked = false; cep.best_checked = false; cr_cep_index = cr_cep_stack.size(); cr_cep_stack.push_back(cep); cr_level = next_cr_level; } } /* * Build the next node info */ /* Find the next cell to be splitted */ if (!next_split_cell) next_split_cell = find_next_cell_to_be_splitted( p.get_cell(p.elements[current_node.split_cell_first])); // Partition::Cell * const next_split_cell = // find_next_cell_to_be_splitted(p.get_cell(p.elements[current_node.split_cell_first])); child_node.split_cell_first = next_split_cell->first; child_node.split_element = TreeNode::SPLIT_START; child_node.certificate_index = certificate_index; child_node.partition_bt_point = p.set_backtrack_point(); child_node.long_prune_redundant.clear(); child_node.long_prune_begin = current_node.long_prune_begin; 
/* Save component recursion info for backtracking */ child_node.cr_level = cr_level; child_node.cr_cep_stack_size = cr_cep_stack.size(); child_node.cr_cep_index = cr_cep_index; search_stack.push_back(child_node); continue; } /* * A leaf node not in the first path or equivalent to the first path */ if (child_node.cmp_to_best_path > 0) { /* * A new, better representative found */ // fprintf(stdout, "Level %u: NEW BEST\n", child_level); fflush(stdout); stats.nof_canupdates++; /* * Update canonical labeling and its inverse */ update_labeling_and_its_inverse(best_path_labeling, best_path_labeling_inv); /* Reset best path automorphism */ reset_permutation(best_path_automorphism); /* Reset best path orbit structure */ best_path_orbits.reset(); /* * Mark the current path to be the best one and save it */ const unsigned int base_size = search_stack.size(); assert(current_level + 1 == base_size); best_path_info.clear(); for (unsigned int i = 0; i < base_size; i++) { search_stack[i].cmp_to_best_path = 0; search_stack[i].in_best_path = true; PathInfo path_info; path_info.splitting_element = search_stack[i].split_element; path_info.certificate_index = search_stack[i].certificate_index; path_info.subcertificate_length = search_stack[i].subcertificate_length; path_info.eqref_hash = search_stack[i].eqref_hash; best_path_info.push_back(path_info); } certificate_best_path = certificate_current_path; /* * Backtrack to the previous level */ continue; } handle_best_path_automorphism: /* * * Best path automorphism handling * */ { /* * Equal to the previous best path */ if (p.is_discrete()) { #if defined(BLISS_CONSISTENCY_CHECKS) /* Verify that the automorphism is correctly built */ for (unsigned int i = 0; i < N; i++) assert(best_path_automorphism[i] == p.elements[best_path_labeling[i]]); #endif } else { /* An automorphism that was found before the partition was discrete. 
* Set the image of all elements in non-disrete cells accordingly */ for (Partition::Cell* c = p.first_nonsingleton_cell; c; c = c->next_nonsingleton) { for (unsigned int i = c->first; i < c->first + c->length; i++) if (p.get_cell(p.elements[best_path_labeling[p.elements[i]]]) ->is_unit()) best_path_automorphism [p.elements[best_path_labeling[p.elements[i]]]] = p.elements[i]; else best_path_automorphism[p.elements[i]] = p.elements[i]; } } #if defined(BLISS_VERIFY_AUTOMORPHISMS) /* Verify that it really is an automorphism */ if (!is_automorphism(best_path_automorphism)) fatal_error("Best path automorhism validation check failed"); #endif unsigned int gca_level_with_first = 0; for (unsigned int i = search_stack.size(); i > 0; i--) { if ((int)first_path_info[gca_level_with_first].splitting_element != search_stack[gca_level_with_first].split_element) break; gca_level_with_first++; } unsigned int gca_level_with_best = 0; for (unsigned int i = search_stack.size(); i > 0; i--) { if ((int)best_path_info[gca_level_with_best].splitting_element != search_stack[gca_level_with_best].split_element) break; gca_level_with_best++; } if (opt_use_long_prune) { /* Record automorphism */ long_prune_add_automorphism(best_path_automorphism); } /* * Update orbit information */ update_orbit_information(best_path_orbits, best_path_automorphism); /* * Update orbit information */ const unsigned int nof_old_orbits = first_path_orbits.nof_orbits(); update_orbit_information(first_path_orbits, best_path_automorphism); if (nof_old_orbits != first_path_orbits.nof_orbits()) { /* Some orbits were merged */ /* Report automorphism */ if (report_hook) (*report_hook)(report_user_param, get_nof_vertices(), best_path_automorphism); /* Update statistics */ stats.nof_generators++; } /* * Compute backjumping level */ unsigned int backjumping_level = current_level + 1 - 1; if (!first_path_orbits.is_minimal_representative( search_stack[gca_level_with_first].split_element)) { backjumping_level = 
gca_level_with_first; } else { assert(!best_path_orbits.is_minimal_representative( search_stack[gca_level_with_best].split_element)); backjumping_level = gca_level_with_best; } /* Backtrack */ search_stack.resize(backjumping_level + 1); continue; } _INTERNAL_ERROR(); handle_first_path_automorphism: /* * * A first-path automorphism: aut[i] = elements[first_path_labeling[i]] * */ if (p.is_discrete()) { #if defined(BLISS_CONSISTENCY_CHECKS) /* Verify that the complete automorphism is correctly built */ for (unsigned int i = 0; i < N; i++) assert(first_path_automorphism[i] == p.elements[first_path_labeling[i]]); #endif } else { /* An automorphism that was found before the partition was discrete. * Set the image of all elements in non-disrete cells accordingly */ for (Partition::Cell* c = p.first_nonsingleton_cell; c; c = c->next_nonsingleton) { for (unsigned int i = c->first; i < c->first + c->length; i++) if (p.get_cell(p.elements[first_path_labeling[p.elements[i]]]) ->is_unit()) first_path_automorphism [p.elements[first_path_labeling[p.elements[i]]]] = p.elements[i]; else first_path_automorphism[p.elements[i]] = p.elements[i]; } } #if defined(BLISS_VERIFY_AUTOMORPHISMS) /* Verify that it really is an automorphism */ if (!is_automorphism(first_path_automorphism)) fatal_error("First path automorphism validation check failed"); #endif if (opt_use_long_prune) { long_prune_add_automorphism(first_path_automorphism); } /* * Update orbit information */ update_orbit_information(first_path_orbits, first_path_automorphism); /* * Compute backjumping level */ for (unsigned int i = 0; i < search_stack.size(); i++) { TreeNode& n = search_stack[i]; if (n.fp_on) { ; } else { n.fp_extendable = TreeNode::YES; } } /* Report automorphism by calling the user defined hook function */ if (report_hook) (*report_hook)(report_user_param, get_nof_vertices(), first_path_automorphism); /* Update statistics */ stats.nof_generators++; continue; } /* while(!search_stack.empty()) */ /* Free "long 
prune" technique memory */ if (opt_use_long_prune) long_prune_deallocate(); /* Release component recursion data in partition */ if (opt_use_comprec) p.cr_free(); } ================================================ FILE: external/bliss/bliss/uintseqhash.hh ================================================ #ifndef BLISS_UINTSEQHASH_HH #define BLISS_UINTSEQHASH_HH #include namespace bliss { static unsigned int rtab[256] = { 0xAEAA35B8, 0x65632E16, 0x155EDBA9, 0x01349B39, 0x8EB8BD97, 0x8E4C5367, 0x8EA78B35, 0x2B1B4072, 0xC1163893, 0x269A8642, 0xC79D7F6D, 0x6A32DEA0, 0xD4D2DA56, 0xD96D4F47, 0x47B5F48A, 0x2587C6BF, 0x642B71D8, 0x5DBBAF58, 0x5C178169, 0xA16D9279, 0x75CDA063, 0x291BC48B, 0x01AC2F47, 0x5416DF7C, 0x45307514, 0xB3E1317B, 0xE1C7A8DE, 0x3ACDAC96, 0x11B96831, 0x32DE22DD, 0x6A1DA93B, 0x58B62381, 0x283810E2, 0xBC30E6A6, 0x8EE51705, 0xB06E8DFB, 0x729AB12A, 0xA9634922, 0x1A6E8525, 0x49DD4E19, 0xE5DB3D44, 0x8C5B3A02, 0xEBDE2864, 0xA9146D9F, 0x736D2CB4, 0xF5229F42, 0x712BA846, 0x20631593, 0x89C02603, 0xD5A5BF6A, 0x823F4E18, 0x5BE5DEFF, 0x1C4EBBFA, 0x5FAB8490, 0x6E559B0C, 0x1FE528D6, 0xB3198066, 0x4A965EB5, 0xFE8BB3D5, 0x4D2F6234, 0x5F125AA4, 0xBCC640FA, 0x4F8BC191, 0xA447E537, 0xAC474D3C, 0x703BFA2C, 0x617DC0E7, 0xF26299D7, 0xC90FD835, 0x33B71C7B, 0x6D83E138, 0xCBB1BB14, 0x029CF5FF, 0x7CBD093D, 0x4C9825EF, 0x845C4D6D, 0x124349A5, 0x53942D21, 0x800E60DA, 0x2BA6EB7F, 0xCEBF30D3, 0xEB18D449, 0xE281F724, 0x58B1CB09, 0xD469A13D, 0x9C7495C3, 0xE53A7810, 0xA866C08E, 0x832A038B, 0xDDDCA484, 0xD5FE0DDE, 0x0756002B, 0x2FF51342, 0x60FEC9C8, 0x061A53E3, 0x47B1884E, 0xDC17E461, 0xA17A6A37, 0x3158E7E2, 0xA40D873B, 0x45AE2140, 0xC8F36149, 0x63A4EE2D, 0xD7107447, 0x6F90994F, 0x5006770F, 0xC1F3CA9A, 0x91B317B2, 0xF61B4406, 0xA8C9EE8F, 0xC6939B75, 0xB28BBC3B, 0x36BF4AEF, 0x3B12118D, 0x4D536ECF, 0x9CF4B46B, 0xE8AB1E03, 0x8225A360, 0x7AE4A130, 0xC4EE8B50, 0x50651797, 0x5BB4C59F, 0xD120EE47, 0x24F3A386, 0xBE579B45, 0x3A378EFC, 0xC5AB007B, 0x3668942B, 0x2DBDCC3A, 0x6F37F64C, 0xC24F862A, 
0xB6F97FCF, 0x9E4FA23D, 0x551AE769, 0x46A8A5A6, 0xDC1BCFDD, 0x8F684CF9, 0x501D811B, 0x84279F80, 0x2614E0AC, 0x86445276, 0xAEA0CE71, 0x0812250F, 0xB586D18A, 0xC68D721B, 0x44514E1D, 0x37CDB99A, 0x24731F89, 0xFA72E589, 0x81E6EBA2, 0x15452965, 0x55523D9D, 0x2DC47E14, 0x2E7FA107, 0xA7790F23, 0x40EBFDBB, 0x77E7906B, 0x6C1DB960, 0x1A8B9898, 0x65FA0D90, 0xED28B4D8, 0x34C3ED75, 0x768FD2EC, 0xFAB60BCB, 0x962C75F4, 0x304F0498, 0x0A41A36B, 0xF7DE2A4A, 0xF4770FE2, 0x73C93BBB, 0xD21C82C5, 0x6C387447, 0x8CDB4CB9, 0x2CC243E8, 0x41859E3D, 0xB667B9CB, 0x89681E8A, 0x61A0526C, 0x883EDDDC, 0x539DE9A4, 0xC29E1DEC, 0x97C71EC5, 0x4A560A66, 0xBD7ECACF, 0x576AE998, 0x31CE5616, 0x97172A6C, 0x83D047C4, 0x274EA9A8, 0xEB31A9DA, 0x327209B5, 0x14D1F2CB, 0x00FE1D96, 0x817DBE08, 0xD3E55AED, 0xF2D30AFC, 0xFB072660, 0x866687D6, 0x92552EB9, 0xEA8219CD, 0xF7927269, 0xF1948483, 0x694C1DF5, 0xB7D8B7BF, 0xFFBC5D2F, 0x2E88B849, 0x883FD32B, 0xA0331192, 0x8CB244DF, 0x41FAF895, 0x16902220, 0x97FB512A, 0x2BEA3CC4, 0xAF9CAE61, 0x41ACD0D5, 0xFD2F28FF, 0xE780ADFA, 0xB3A3A76E, 0x7112AD87, 0x7C3D6058, 0x69E64FFF, 0xE5F8617C, 0x8580727C, 0x41F54F04, 0xD72BE498, 0x653D1795, 0x1275A327, 0x14B499D4, 0x4E34D553, 0x4687AA39, 0x68B64292, 0x5C18ABC3, 0x41EABFCC, 0x92A85616, 0x82684CF8, 0x5B9F8A4E, 0x35382FFE, 0xFB936318, 0x52C08E15, 0x80918B2E, 0x199EDEE0, 0xA9470163, 0xEC44ACDD, 0x612D6735, 0x8F88EA7D, 0x759F5EA4, 0xE5CC7240, 0x68CFEB8B, 0x04725601, 0x0C22C23E, 0x5BC97174, 0x89965841, 0x5D939479, 0x690F338A, 0x3C2D4380, 0xDAE97F2B }; // A hash for sequences of unsigned ints. class UintSeqHash { protected: unsigned int h; public: UintSeqHash() {h = 0; } UintSeqHash(const UintSeqHash &other) {h = other.h; } UintSeqHash& operator=(const UintSeqHash &other) {h = other.h; return *this; } /** Reset the hash value. */ void reset() {h = 0; } /** Add the unsigned int \a n to the sequence. 
*/ void update(unsigned int i) { i++; while(i > 0) { h ^= rtab[i & 0xff]; const unsigned int b = (h & 0x80000000) >> 31; i = i >> 8; h = (h << 1) | b; } } /** Get the hash value of the sequence seen so far. */ unsigned int get_value() const {return h; } /** Compare the hash values of this and \a other. * Return -1/0/1 if the value of this is smaller/equal/greater than * that of \a other. */ int cmp(const UintSeqHash &other) const { return (h < other.h)?-1:((h == other.h)?0:1); } /** An abbreviation for cmp(other) < 0 */ bool is_lt(const UintSeqHash &other) const {return(cmp(other) < 0); } /** An abbreviation for cmp(other) <= 0 */ bool is_le(const UintSeqHash &other) const {return(cmp(other) <= 0); } /** An abbreviation for cmp(other) == 0 */ bool is_equal(const UintSeqHash &other) const {return(cmp(other) == 0); } }; } // namespace bliss #endif ================================================ FILE: external/bliss/bliss/utils.hh ================================================ #ifndef BLISS_UTILS_HH #define BLISS_UTILS_HH /* Copyright (c) 2003-2015 Tommi Junttila Released under the GNU Lesser General Public License version 3. This file is part of bliss. bliss is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, version 3 of the License. bliss is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with bliss. If not, see . */ /** * \file * \brief Some small utilities. * */ #include namespace bliss { /** * Print the permutation \a perm of {0,...,N-1} in the cycle format * in the file stream \a fp. * The amount \a offset is added to each element before printing, * e.g. 
the permutation (2 4) is printed as (3 5) when \a offset is 1. */ void print_permutation(FILE* fp, const unsigned int N, const unsigned int* perm, const unsigned int offset = 0); /** * Print the permutation \a perm of {0,...,N-1} in the cycle format * in the file stream \a fp. * The amount \a offset is added to each element before printing, * e.g. the permutation (2 4) is printed as (3 5) when \a offset is 1. */ void print_permutation(FILE* fp, const std::vector& perm, const unsigned int offset = 0); /** * Check whether \a perm is a valid permutation on {0,...,N-1}. * Slow, mainly for debugging and validation purposes. */ bool is_permutation(const unsigned int N, const unsigned int* perm); /** * Check whether \a perm is a valid permutation on {0,...,N-1}. * Slow, mainly for debugging and validation purposes. */ bool is_permutation(const std::vector& perm); } // namespace bliss #endif ================================================ FILE: inputs/CMakeLists.txt ================================================ file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/download.cmake "file(DOWNLOAD http://iss.oden.utexas.edu/projects/galois/downloads/small_inputs_for_lonestar_test.tar.gz ${CMAKE_CURRENT_BINARY_DIR}/lonestar-cpu-inputs.tar.gz SHOW_PROGRESS)") add_custom_command( OUTPUT lonestar-cpu-inputs.tar.gz COMMAND ${CMAKE_COMMAND} -P ${CMAKE_CURRENT_BINARY_DIR}/download.cmake ) add_custom_command( OUTPUT small_inputs COMMAND ${CMAKE_COMMAND} -E tar xJf lonestar-cpu-inputs.tar.gz DEPENDS lonestar-cpu-inputs.tar.gz WORKING_DIRECTORY ${PROJECT_BINARY_DIR}/inputs COMMENT "Unpacking lonestar-cpu-inputs.tar.gz" VERBATIM ) add_custom_target(input DEPENDS small_inputs) ================================================ FILE: inputs/cholesky/matrix1.txt ================================================ 4 0 0 2 0 1 0 7 2 0 0 3 0 2 3 0 0 0 2 0 0 6 3 0 0 0 0 3 7 3 1 3 0 0 3 11 ================================================ FILE: inputs/cholesky/matrix1.txt.choleskyedges 
================================================ 0 0 2.000000 0 3 1.000000 0 5 0.500000 1 1 2.645751 1 2 0.755929 1 5 1.133893 2 2 1.558387 2 5 -0.550019 3 3 2.236068 3 4 1.341641 3 5 -0.223607 4 4 2.280351 4 5 1.447146 5 5 2.649063 ================================================ FILE: inputs/cholesky/matrix1.txt.dep ================================================ 0 1 2 3 4 5 ================================================ FILE: inputs/cholesky/matrix1.txt.filled ================================================ 0 0 4.000000 0 3 2.000000 0 5 1.000000 1 1 7.000000 1 2 2.000000 1 5 3.000000 2 2 3.000000 2 5 0.000000 3 3 6.000000 3 4 3.000000 3 5 0.000000 4 4 7.000000 4 5 3.000000 5 5 11.000000 ================================================ FILE: inputs/cholesky/very-sparse.txt ================================================ 576.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 256.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 256.0 0.0 0.0 1369.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 144.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 144.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2304.0 0.0 1200.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 324.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1200.0 0.0 641.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 9.0 0.0 0.0 256.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1040.0 ================================================ FILE: inputs/cholesky/very-sparse.txt.choleskyedges ================================================ 0 0 24.000000 1 1 16.000000 1 9 16.000000 2 2 37.000000 3 3 12.000000 4 4 12.000000 5 5 48.000000 5 7 25.000000 6 6 18.000000 7 7 4.000000 8 8 3.000000 9 9 28.000000 ================================================ FILE: inputs/cholesky/very-sparse.txt.dep ================================================ 0 1 2 3 4 5 6 7 8 9 ================================================ FILE: inputs/cholesky/very-sparse.txt.filled ================================================ 0 0 576.000000 1 1 256.000000 1 9 256.000000 2 2 1369.000000 3 3 144.000000 4 4 144.000000 5 5 2304.000000 5 7 
1200.000000
6 6 324.000000
7 7 641.000000
8 8 9.000000
9 9 1040.000000


================================================
FILE: libcusp/CMakeLists.txt
================================================
add_library(galois_cusp INTERFACE)
add_library(Galois::cusp ALIAS galois_cusp)
set_target_properties(galois_cusp PROPERTIES EXPORT_NAME cusp)
add_dependencies(lib galois_cusp)

# NOTE(review): the extraction stripped the generator expressions here
# (only bare `$` tokens remained); restored to the conventional
# build/install interface pair -- confirm against the repository.
target_include_directories(galois_cusp INTERFACE
  $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
  $<INSTALL_INTERFACE:include>
)

target_link_libraries(galois_cusp INTERFACE galois_dist_async)

install(
  DIRECTORY include/
  DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}"
  COMPONENT dev
  FILES_MATCHING PATTERN "*.h"
)

install(TARGETS galois_cusp
  EXPORT GaloisTargets
  LIBRARY DESTINATION "${CMAKE_INSTALL_LIBDIR}"
    COMPONENT shlib
  ARCHIVE DESTINATION "${CMAKE_INSTALL_LIBDIR}"
    COMPONENT lib
  INCLUDES DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}"
)


================================================
FILE: libcusp/include/galois/graphs/BasePolicies.h
================================================
/*
 * This file belongs to the Galois project, a C++ library for exploiting
 * parallelism. The code is being released under the terms of the 3-Clause BSD
 * License (a copy is located in LICENSE.txt at the top-level directory).
 *
 * Copyright (C) 2019, The University of Texas at Austin. All rights reserved.
 * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS
 * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF
 * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF
 * DEALING OR USAGE OF TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH
 * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION.
Under no circumstances * shall University be liable for incidental, special, indirect, direct or * consequential damages or loss of profits, interruption of business, or * related expenses which may arise from use of Software or Documentation, * including but not limited to those resulting from defects in Software and/or * Documentation, or loss or inaccuracy of data of any kind. */ /** * @file BasePolicies.h * * Header file that includes the base classes for defining CuSP partitioning * policies. */ #ifndef _GALOIS_CUSP_PSCAFFOLD_H_ #define _GALOIS_CUSP_PSCAFFOLD_H_ namespace galois { namespace graphs { /** * Default fields and functions all CuSP partitioners use; this is a class to * inherit from. */ class PartitioningScaffold { protected: uint32_t _hostID; //!< host ID of owner of this object uint32_t _numHosts; //!< total number of hosts uint64_t _numNodes; //!< number of nodes in graph uint64_t _numEdges; //!< number of edges in graph //! maps from host id to nodes that host as read from disk std::vector> _gid2host; public: /** * Constructor for Scaffold. * * @param hostID Host ID of caller * @param numHosts Total num hosts in execution * @param numNodes Total number of nodes in graph * @param numEdges Total number of edges in graph */ PartitioningScaffold(uint32_t hostID, uint32_t numHosts, uint64_t numNodes, uint64_t numEdges) : _hostID(hostID), _numHosts(numHosts), _numNodes(numNodes), _numEdges(numEdges) {} /** * Save a provided map from host to nodes a host has read into this object * * @param gid2host Map of hosts to read nodes to save */ void saveGIDToHost(std::vector>& gid2host) { _gid2host = gid2host; } }; /** * Policies that use the read assignment of nodes as the masters. Does not * need to go through a master assignment phase, saving overhead. */ class ReadMasterAssignment : public PartitioningScaffold { public: /** * Constructor simply calls parent constructor. 
*/ ReadMasterAssignment(uint32_t hostID, uint32_t numHosts, uint64_t numNodes, uint64_t numEdges) : PartitioningScaffold(hostID, numHosts, numNodes, numEdges) {} /** * Returns the host ID of the host that read a particular node and its edges * from disk. * * @param gid GID of node to get master of * @returns Host ID of host that read the node specified by the GID. */ uint32_t retrieveMaster(uint32_t gid) const { for (auto h = 0U; h < _numHosts; ++h) { uint64_t start, end; std::tie(start, end) = _gid2host[h]; if (gid >= start && gid < end) { return h; } } assert(false); return _numHosts; } // below all unused if not assigning masters in default manner, but must be // defined or compiler complains /** * Returns false as this partitioning policy doesn't have a master assignment * phase. */ bool masterAssignPhase() const { return false; } /** * Does nothing as this policy doesn't have a master assignment phase */ void enterStage2() {} /** * Does nothing because this policy doesn't have a master assignment phase. * (uses read assignment) */ template uint32_t getMaster(uint32_t, galois::graphs::BufferedGraph&, const std::vector&, std::unordered_map&, const std::vector&, std::vector>&, const std::vector&, std::vector>&) { return 0; } /** * No-op because no master assignment phase. */ void saveGID2HostInfo(std::unordered_map&, std::vector&, uint64_t) {} /** * Technically doesn't nothing and should never be called because no master * assignment phase. */ bool addMasterMapping(uint32_t, uint32_t) { return false; } }; /** * Policies that use a custom assignment of masters (from the user). * Needs to go through a master assignment phase, which adds overhead * to partitioning, but may get better quality partitions. */ class CustomMasterAssignment : public PartitioningScaffold { protected: char _status; //!< Specifies what phase of master assignment partitioner is on //! Metadata for determining where a node's master is std::vector _localNodeToMaster; //! 
Map GID to its master std::unordered_map _gid2masters; //! This host's node offset (each host reads a distinct contiguous portion //! of graph uint64_t _nodeOffset; /** * Return the reader of a particular node. * @param gid GID of node to get reader of * @return Host reader of node passed in as param */ unsigned getHostReader(uint64_t gid) const { for (auto i = 0U; i < _numHosts; ++i) { uint64_t start, end; std::tie(start, end) = _gid2host[i]; if (gid >= start && gid < end) { return i; } } return -1; } public: //! Calls parent constructor to initialize common data CustomMasterAssignment(uint32_t hostID, uint32_t numHosts, uint64_t numNodes, uint64_t numEdges) : PartitioningScaffold(hostID, numHosts, numNodes, numEdges), _status(0) { } /** * Retrieves a saved master mapping: does not fail if a GID * mapping is not found but instead returns -1 if in stage 1, else * fails. * * @param gid GID to get master of * @returns Master of specified GID, -1, unsigned, if not found */ uint32_t retrieveMaster(uint32_t gid) const { if (_status != 0) { // use map if not a locally read node, else use vector if (getHostReader(gid) != _hostID) { auto gidMasterIter = _gid2masters.find(gid); // found in map if (gidMasterIter != _gid2masters.end()) { uint32_t mappedMaster = gidMasterIter->second; // galois::gDebug("[", _hostID, "] ", gid, " found with master ", // mappedMaster, "!"); // make sure host is in bounds assert(mappedMaster < _numHosts); return mappedMaster; } else { // NOT FOUND (not necessarily a bad thing, and required for // some cases) galois::gDebug("[", _hostID, "] ", gid, " not found!"); if (_status == 2) { // die if we expect all gids to be mapped already (stage 2) GALOIS_DIE("should not fail to find a GID after stage 2 " "of master assignment phase"); } return (uint32_t)-1; } } else { // determine offset uint32_t offsetIntoMap = gid - _nodeOffset; assert(offsetIntoMap != (uint32_t)-1); assert(offsetIntoMap < _localNodeToMaster.size()); return 
_localNodeToMaster[offsetIntoMap]; } } else { // stage 0 = this function shouldn't be called GALOIS_DIE("master setup incomplete"); return (uint32_t)-1; } } /** * Given gid to master mapping info, save it into a local map. * * @param gid2offsets Map a GID to an offset into a vector containing master * mapping information * @param localNodeToMaster Vector that represents the master mapping of * local nodes * @param nodeOffset First GID of nodes read by this host */ void saveGID2HostInfo(std::unordered_map& gid2offsets, std::vector& localNodeToMaster, uint64_t nodeOffset) { #ifndef NDEBUG size_t originalSize = _gid2masters.size(); #endif for (auto i = gid2offsets.begin(); i != gid2offsets.end(); i++) { assert(i->second < localNodeToMaster.size()); galois::gDebug("Map ", i->first, " to ", localNodeToMaster[i->second]); _gid2masters[i->first] = localNodeToMaster[i->second]; } assert(_gid2masters.size() == (originalSize + gid2offsets.size())); // get memory back gid2offsets.clear(); size_t myLocalNodes = _gid2host[_hostID].second - _gid2host[_hostID].first; assert((myLocalNodes + _gid2masters.size() - originalSize) == localNodeToMaster.size()); // copy over to this structure _localNodeToMaster = std::move(localNodeToMaster); assert(myLocalNodes <= _localNodeToMaster.size()); // resize to fit only this host's read nodes _localNodeToMaster.resize(myLocalNodes); _nodeOffset = nodeOffset; // stage 1 setup complete _status = 1; } //! Returns true as policies that inherit from this should define master //! assignment function bool masterAssignPhase() const { return true; } //! Shifts master assignment phase to stage 2. void enterStage2() { _status = 2; } /** * CuSP's "getMaster" function. * This function should be defined by user in child class to assign a node to * a host. * * @todo Consolidate metadata into single struct to clean up function. 
* @returns Host id in which to assing a node */ template uint32_t getMaster(uint32_t, galois::graphs::BufferedGraph&, const std::vector&, std::unordered_map&, const std::vector&, std::vector>&, const std::vector&, std::vector>&) { return (uint32_t)-1; } /** * Add a new master mapping to the local map: needs to be in stage 1 * * @param gid GID to map; should not be a GID read by this host (won't * cause problems, but would just be a waste of compute resouces) * @param mappedMaster master to map a GID to * @returns true if new mapping added; false if already existed in map */ bool addMasterMapping(uint32_t gid, uint32_t mappedMaster) { assert(mappedMaster < _numHosts); if (_status <= 1) { auto offsetIntoMapIter = _gid2masters.find(gid); if (offsetIntoMapIter == _gid2masters.end()) { // NOT FOUND galois::gDebug("[", _hostID, "] ", gid, " not found; mapping!"); _gid2masters[gid] = mappedMaster; return true; } else { // already mapped galois::gDebug("[", _hostID, "] ", gid, " already mapped with master ", offsetIntoMapIter->second, "!"); assert(offsetIntoMapIter->second == mappedMaster); return false; } } else { GALOIS_DIE("unexpected status in add master mapping: ", _status); return false; } } }; } // end namespace graphs } // end namespace galois #endif ================================================ FILE: libcusp/include/galois/graphs/CuSPPartitioner.h ================================================ /* * This file belongs to the Galois project, a C++ library for exploiting * parallelism. The code is being released under the terms of the 3-Clause BSD * License (a copy is located in LICENSE.txt at the top-level directory). * * Copyright (C) 2019, The University of Texas at Austin. All rights reserved. 
* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY, * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF * DEALING OR USAGE OF TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances * shall University be liable for incidental, special, indirect, direct or * consequential damages or loss of profits, interruption of business, or * related expenses which may arise from use of Software or Documentation, * including but not limited to those resulting from defects in Software and/or * Documentation, or loss or inaccuracy of data of any kind. */ /** * @file CuSPPartitioner.h * * Contains the main CuSP partitioning function. */ #ifndef _GALOIS_CUSP_ #define _GALOIS_CUSP_ #include "galois/DistGalois.h" #include "galois/graphs/DistributedGraph.h" #include "galois/graphs/NewGeneric.h" #include "galois/graphs/GenericPartitioners.h" namespace galois { //! Enum for the input/output format of the partitioner. enum CUSP_GRAPH_TYPE { CUSP_CSR, //!< Compressed sparse row graph format, i.e. outgoing edges CUSP_CSC //!< Compressed sparse column graph format, i.e. incoming edges }; template using DistGraphPtr = std::unique_ptr>; /** * Main CuSP function: partitions a graph on disk, one partition per host. * * @param graphFile Graph file to read in the Galois binary CSR format * @param inputType Specifies which input format (CSR or CSC) should be given * to the partitioner * @param outputType Specifies the output format (CSR or CSC) that each * partition will be created in * @param symmetricGraph This should be "true" if the passed in graphFile * is a symmetric graph * @param transposeGraphFile Transpose graph of graphFile in Galois binary * CSC format (i.e. give it the transpose version of graphFile). 
Ignore * this argument if the graph is symmetric. * @param masterBlockFile * @param cuspAsync Toggles asynchronous master assignment phase during * partitioning * @param cuspStateRounds Toggles number of rounds used to synchronize * partitioning state during master assignment phase * @param readPolicy Determines how each host should divide the reading * load of the graph on disk * @param nodeWeight When using a read policy that involves nodes and edges, * this argument assigns a weight to give each node. * @param edgeWeight When using a read policy that involves nodes and edges, * this argument assigns a weight to give each edge. * * @tparam PartitionPolicy Partitioning policy object that specifies the * placement of nodes/edges during partitioning. * @tparam NodeData Data structure to be created for each node in the graph * @tparam EdgeData Type of data to be stored on each edge. Currently * only guarantee support for void or uint32_t; all other types may cause * undefined behavior. * * @returns A local partition of the passed in graph as a DistributedGraph * * @todo Look into making void node data work in LargeArray for D-Galois; * void specialization. For now, use char as default type */ template DistGraphPtr cuspPartitionGraph(std::string graphFile, CUSP_GRAPH_TYPE inputType, CUSP_GRAPH_TYPE outputType, bool symmetricGraph = false, std::string transposeGraphFile = "", std::string masterBlockFile = "", bool cuspAsync = true, uint32_t cuspStateRounds = 100, galois::graphs::MASTERS_DISTRIBUTION readPolicy = galois::graphs::BALANCED_EDGES_OF_MASTERS, uint32_t nodeWeight = 0, uint32_t edgeWeight = 0) { auto& net = galois::runtime::getSystemNetworkInterface(); using DistGraphConstructor = galois::graphs::NewDistGraphGeneric; // TODO @todo bring back graph saving/reading functionality? 
if (!symmetricGraph) { // out edges or in edges std::string inputToUse; // depending on output type may need to transpose edges bool useTranspose; // see what input is specified if (inputType == CUSP_CSR) { inputToUse = graphFile; if (outputType == CUSP_CSR) { useTranspose = false; } else if (outputType == CUSP_CSC) { useTranspose = true; } else { GALOIS_DIE("CuSP output graph type is invalid"); } } else if (inputType == CUSP_CSC) { inputToUse = transposeGraphFile; if (outputType == CUSP_CSR) { useTranspose = true; } else if (outputType == CUSP_CSC) { useTranspose = false; } else { GALOIS_DIE("CuSP output graph type is invalid"); } } else { GALOIS_DIE("Invalid input graph type specified in CuSP partitioner"); } return std::make_unique( inputToUse, net.ID, net.Num, cuspAsync, cuspStateRounds, useTranspose, readPolicy, nodeWeight, edgeWeight, masterBlockFile); } else { // symmetric graph path: assume the passed in graphFile is a symmetric // graph; output is also symmetric return std::make_unique( graphFile, net.ID, net.Num, cuspAsync, cuspStateRounds, false, readPolicy, nodeWeight, edgeWeight, masterBlockFile); } } } // end namespace galois #endif ================================================ FILE: libcusp/include/galois/graphs/DistributedGraph.h ================================================ /* * This file belongs to the Galois project, a C++ library for exploiting * parallelism. The code is being released under the terms of the 3-Clause BSD * License (a copy is located in LICENSE.txt at the top-level directory). * * Copyright (C) 2018, The University of Texas at Austin. All rights reserved. * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY, * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF * DEALING OR USAGE OF TRADE. 
NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances * shall University be liable for incidental, special, indirect, direct or * consequential damages or loss of profits, interruption of business, or * related expenses which may arise from use of Software or Documentation, * including but not limited to those resulting from defects in Software and/or * Documentation, or loss or inaccuracy of data of any kind. */ /** * @file DistributedGraph.h * * Contains the implementation for DistGraph. Command line argument definitions * are found in DistributedGraph.cpp. */ #ifndef _GALOIS_DIST_HGRAPH_H_ #define _GALOIS_DIST_HGRAPH_H_ #include #include #include "galois/graphs/LC_CSR_Graph.h" #include "galois/graphs/BufferedGraph.h" #include "galois/runtime/DistStats.h" #include "galois/graphs/OfflineGraph.h" #include "galois/DynamicBitset.h" /* * Headers for boost serialization */ namespace galois { namespace graphs { /** * Enums specifying how masters are to be distributed among hosts. */ enum MASTERS_DISTRIBUTION { //! balance nodes BALANCED_MASTERS, //! balance edges BALANCED_EDGES_OF_MASTERS, //! balance nodes and edges BALANCED_MASTERS_AND_EDGES }; /** * Base DistGraph class that all distributed graphs extend from. * * @tparam NodeTy type of node data for the graph * @tparam EdgeTy type of edge data for the graph */ template class DistGraph { private: //! Graph name used for printing things constexpr static const char* const GRNAME = "dGraph"; using GraphTy = galois::graphs::LC_CSR_Graph; // vector for determining range objects for master nodes + nodes // with edges (which includes masters) //! represents split of all nodes among threads to balance edges std::vector allNodesRanges; //! represents split of master nodes among threads to balance edges std::vector masterRanges; //! represents split of nodes with edges (includes masters) among threads to //! balance edges std::vector withEdgeRanges; //! 
represents split of all nodes among threads to balance in-edges std::vector allNodesRangesIn; //! represents split of master nodes among threads to balance in-edges std::vector masterRangesIn; using NodeRangeType = galois::runtime::SpecificRange>; //! Vector of ranges that stores the 3 different range objects that a user is //! able to access std::vector specificRanges; //! Like specificRanges, but for in edges std::vector specificRangesIn; protected: //! The internal graph used by DistGraph to represent the graph GraphTy graph; //! Marks if the graph is transposed or not. bool transposed; // global graph variables uint64_t numGlobalNodes; //!< Total nodes in the global unpartitioned graph. uint64_t numGlobalEdges; //!< Total edges in the global unpartitioned graph. uint32_t numNodes; //!< Num nodes in this graph in total uint64_t numEdges; //!< Num edges in this graph in total const unsigned id; //!< ID of the machine. const uint32_t numHosts; //!< Total number of machines // local graph // size() = Number of nodes created on this host (masters + mirrors) uint32_t numOwned; //!< Number of nodes owned (masters) by this host. //!< size() - numOwned = mirrors on this host uint32_t beginMaster; //!< Local id of the beginning of master nodes. //!< beginMaster + numOwned = local id of the end of //!< master nodes uint32_t numNodesWithEdges; //!< Number of nodes (masters + mirrors) that have //!< outgoing edges //! Information that converts host to range of nodes that host reads std::vector> gid2host; //! Mirror nodes from different hosts. For reduce std::vector> mirrorNodes; //! GID = localToGlobalVector[LID] std::vector localToGlobalVector; //! LID = globalToLocalMap[GID] std::unordered_map globalToLocalMap; //! Increments evilPhase, a phase counter used by communication. 
void inline increment_evilPhase() { ++galois::runtime::evilPhase; if (galois::runtime::evilPhase >= static_cast( std::numeric_limits::max())) { // limit defined by MPI or // LCI galois::runtime::evilPhase = 1; } } //! Returns evilPhase + 1, handling loop around as necessary unsigned inline evilPhasePlus1() { unsigned result = galois::runtime::evilPhase + 1; // limit defined by MPI or LCI if (result >= uint32_t{std::numeric_limits::max()}) { return 1; } return result; } //! used to sort edges in the sort edges function template struct IdLess { bool operator()(const galois::graphs::EdgeSortValue& e1, const galois::graphs::EdgeSortValue& e2) const { return e1.dst < e2.dst; } }; private: /** * Given an OfflineGraph, compute the masters for each node by * evenly (or unevenly as specified by scale factor) * blocking the nodes off to assign to each host. Considers * ONLY nodes and not edges. * * @param g The offline graph which has loaded the graph you want * to get the masters for * @param scalefactor A vector that specifies if a particular host * should have more or less than other hosts * @param DecomposeFactor Specifies how decomposed the blocking * of nodes should be. For example, a factor of 2 will make 2 blocks * out of 1 block had the decompose factor been set to 1. */ void computeMastersBlockedNodes(galois::graphs::OfflineGraph& g, const std::vector& scalefactor, unsigned DecomposeFactor = 1) { uint64_t numNodes_to_divide = g.size(); if (scalefactor.empty() || (numHosts * DecomposeFactor == 1)) { for (unsigned i = 0; i < numHosts * DecomposeFactor; ++i) gid2host.push_back(galois::block_range(uint64_t{0}, numNodes_to_divide, i, numHosts * DecomposeFactor)); return; } // TODO: not compatible with DecomposeFactor. 
assert(scalefactor.size() == numHosts); unsigned numBlocks = 0; for (unsigned i = 0; i < numHosts; ++i) { numBlocks += scalefactor[i]; } std::vector> blocks; for (unsigned i = 0; i < numBlocks; ++i) { blocks.push_back( galois::block_range(uint64_t{0}, numNodes_to_divide, i, numBlocks)); } std::vector prefixSums; prefixSums.push_back(0); for (unsigned i = 1; i < numHosts; ++i) { prefixSums.push_back(prefixSums[i - 1] + scalefactor[i - 1]); } for (unsigned i = 0; i < numHosts; ++i) { unsigned firstBlock = prefixSums[i]; unsigned lastBlock = prefixSums[i] + scalefactor[i] - 1; gid2host.push_back( std::make_pair(blocks[firstBlock].first, blocks[lastBlock].second)); } } /** * Given an OfflineGraph, compute the masters for each node by * evenly (or unevenly as specified by scale factor) * blocking the nodes off to assign to each host while taking * into consideration the only edges of the node to get * even blocks. * * @param g The offline graph which has loaded the graph you want * to get the masters for * @param scalefactor A vector that specifies if a particular host * should have more or less than other hosts * @param DecomposeFactor Specifies how decomposed the blocking * of nodes should be. For example, a factor of 2 will make 2 blocks * out of 1 block had the decompose factor been set to 1. 
*/ void computeMastersBalancedEdges(galois::graphs::OfflineGraph& g, const std::vector& scalefactor, uint32_t edgeWeight, unsigned DecomposeFactor = 1) { if (edgeWeight == 0) { edgeWeight = 1; } auto& net = galois::runtime::getSystemNetworkInterface(); gid2host.resize(numHosts * DecomposeFactor); for (unsigned d = 0; d < DecomposeFactor; ++d) { auto r = g.divideByNode(0, edgeWeight, (id + d * numHosts), numHosts * DecomposeFactor, scalefactor); gid2host[id + d * numHosts].first = *(r.first.first); gid2host[id + d * numHosts].second = *(r.first.second); } for (unsigned h = 0; h < numHosts; ++h) { if (h == id) { continue; } galois::runtime::SendBuffer b; for (unsigned d = 0; d < DecomposeFactor; ++d) { galois::runtime::gSerialize(b, gid2host[id + d * numHosts]); } net.sendTagged(h, galois::runtime::evilPhase, b); } net.flush(); unsigned received = 1; while (received < numHosts) { decltype(net.recieveTagged(galois::runtime::evilPhase, nullptr)) p; do { p = net.recieveTagged(galois::runtime::evilPhase, nullptr); } while (!p); assert(p->first != id); auto& b = p->second; for (unsigned d = 0; d < DecomposeFactor; ++d) { galois::runtime::gDeserialize(b, gid2host[p->first + d * numHosts]); } ++received; } increment_evilPhase(); #ifndef NDEBUG for (unsigned h = 0; h < numHosts; h++) { if (h == 0) { assert(gid2host[h].first == 0); } else if (h == numHosts - 1) { assert(gid2host[h].first == gid2host[h - 1].second); assert(gid2host[h].second == g.size()); } else { assert(gid2host[h].first == gid2host[h - 1].second); assert(gid2host[h].second == gid2host[h + 1].first); } } #endif } /** * Given an OfflineGraph, compute the masters for each node by * evenly (or unevenly as specified by scale factor) * blocking the nodes off to assign to each host while taking * into consideration the edges of the node AND the node itself. 
* * @param g The offline graph which has loaded the graph you want * to get the masters for * @param scalefactor A vector that specifies if a particular host * should have more or less than other hosts * @param DecomposeFactor Specifies how decomposed the blocking * of nodes should be. For example, a factor of 2 will make 2 blocks * out of 1 block had the decompose factor been set to 1. Ignored * in this function currently. * * @todo make this function work with decompose factor */ void computeMastersBalancedNodesAndEdges( galois::graphs::OfflineGraph& g, const std::vector& scalefactor, uint32_t nodeWeight, uint32_t edgeWeight, unsigned) { if (nodeWeight == 0) { nodeWeight = g.sizeEdges() / g.size(); // average degree } if (edgeWeight == 0) { edgeWeight = 1; } auto& net = galois::runtime::getSystemNetworkInterface(); gid2host.resize(numHosts); auto r = g.divideByNode(nodeWeight, edgeWeight, id, numHosts, scalefactor); gid2host[id].first = *r.first.first; gid2host[id].second = *r.first.second; for (unsigned h = 0; h < numHosts; ++h) { if (h == id) continue; galois::runtime::SendBuffer b; galois::runtime::gSerialize(b, gid2host[id]); net.sendTagged(h, galois::runtime::evilPhase, b); } net.flush(); unsigned received = 1; while (received < numHosts) { decltype(net.recieveTagged(galois::runtime::evilPhase, nullptr)) p; do { p = net.recieveTagged(galois::runtime::evilPhase, nullptr); } while (!p); assert(p->first != id); auto& b = p->second; galois::runtime::gDeserialize(b, gid2host[p->first]); ++received; } increment_evilPhase(); } protected: /** * Wrapper call that will call into more specific compute masters * functions that compute masters based on nodes, edges, or both. 
* * @param masters_distribution method of masters distribution to use * @param g The offline graph which has loaded the graph you want * to get the masters for * @param scalefactor A vector that specifies if a particular host * should have more or less than other hosts * @param nodeWeight weight to give nodes when computing balance * @param edgeWeight weight to give edges when computing balance * @param DecomposeFactor Specifies how decomposed the blocking * of nodes should be. For example, a factor of 2 will make 2 blocks * out of 1 block had the decompose factor been set to 1. */ uint64_t computeMasters(MASTERS_DISTRIBUTION masters_distribution, galois::graphs::OfflineGraph& g, const std::vector& scalefactor, uint32_t nodeWeight = 0, uint32_t edgeWeight = 0, unsigned DecomposeFactor = 1) { galois::Timer timer; timer.start(); g.reset_seek_counters(); uint64_t numNodes_to_divide = g.size(); // compute masters for all nodes switch (masters_distribution) { case BALANCED_MASTERS: computeMastersBlockedNodes(g, scalefactor, DecomposeFactor); break; case BALANCED_MASTERS_AND_EDGES: computeMastersBalancedNodesAndEdges(g, scalefactor, nodeWeight, edgeWeight, DecomposeFactor); break; case BALANCED_EDGES_OF_MASTERS: default: computeMastersBalancedEdges(g, scalefactor, edgeWeight, DecomposeFactor); break; } timer.stop(); galois::runtime::reportStatCond_Tmax( GRNAME, "MasterDistTime", timer.get()); galois::gPrint( "[", id, "] Master distribution time : ", timer.get_usec() / 1000000.0f, " seconds to read ", g.num_bytes_read(), " bytes in ", g.num_seeks(), " seeks (", g.num_bytes_read() / (float)timer.get_usec(), " MBPS)\n"); return numNodes_to_divide; } //! reader assignment from a file //! 
corresponds to master assignment if using an edge cut void readersFromFile(galois::graphs::OfflineGraph& g, std::string filename) { // read file lines std::ifstream mappings(filename); std::string curLine; unsigned timesToRead = id + 1; for (unsigned i = 0; i < timesToRead; i++) { std::getline(mappings, curLine); } std::vector modifyLine(curLine.begin(), curLine.end()); char* tokenizedString = modifyLine.data(); char* token; token = strtok(tokenizedString, " "); // loop 6 more times for (unsigned i = 0; i < 6; i++) { token = strtok(NULL, " "); } std::string left(token); // 3 more times for right for (unsigned i = 0; i < 3; i++) { token = strtok(NULL, " "); } std::string right(token); gid2host.resize(numHosts); gid2host[id].first = std::stoul(left); gid2host[id].second = std::stoul(right) + 1; galois::gPrint("[", id, "] Left: ", gid2host[id].first, ", Right: ", gid2host[id].second, "\n"); ///////////////////////// // send/recv from other hosts ///////////////////////// auto& net = galois::runtime::getSystemNetworkInterface(); for (unsigned h = 0; h < numHosts; ++h) { if (h == id) continue; galois::runtime::SendBuffer b; galois::runtime::gSerialize(b, gid2host[id]); net.sendTagged(h, galois::runtime::evilPhase, b); } net.flush(); unsigned received = 1; while (received < numHosts) { decltype(net.recieveTagged(galois::runtime::evilPhase, nullptr)) p; do { p = net.recieveTagged(galois::runtime::evilPhase, nullptr); } while (!p); assert(p->first != id); auto& b = p->second; galois::runtime::gDeserialize(b, gid2host[p->first]); ++received; } increment_evilPhase(); // sanity checking assignment for (unsigned h = 0; h < numHosts; h++) { if (h == 0) { GALOIS_ASSERT(gid2host[h].first == 0); } else if (h == numHosts - 1) { GALOIS_ASSERT(gid2host[h].first == gid2host[h - 1].second, gid2host[h].first, " ", gid2host[h - 1].second); GALOIS_ASSERT(gid2host[h].second == g.size(), gid2host[h].second, " ", g.size()); } else { GALOIS_ASSERT(gid2host[h].first == gid2host[h - 1].second, 
gid2host[h].first, " ", gid2host[h - 1].second); GALOIS_ASSERT(gid2host[h].second == gid2host[h + 1].first, gid2host[h].second, " ", gid2host[h + 1].first); } } } uint32_t G2L(uint64_t gid) const { assert(isLocal(gid)); return globalToLocalMap.at(gid); } uint64_t L2G(uint32_t lid) const { return localToGlobalVector[lid]; } public: //! Type representing a node in this graph using GraphNode = typename GraphTy::GraphNode; //! Expose EdgeTy to other classes using EdgeType = EdgeTy; //! iterator type over nodes using iterator = typename GraphTy::iterator; //! constant iterator type over nodes using const_iterator = typename GraphTy::const_iterator; //! iterator type over edges using edge_iterator = typename GraphTy::edge_iterator; /** * Constructor for DistGraph. Initializes metadata fields. * * @param host host number that this graph resides on * @param numHosts total number of hosts in the currently executing program */ DistGraph(unsigned host, unsigned numHosts) : transposed(false), id(host), numHosts(numHosts) { mirrorNodes.resize(numHosts); numGlobalNodes = 0; numGlobalEdges = 0; } /** * Return a vector of pairs denoting mirror node ranges. * * Assumes all mirror nodes occur after the masters: this invariant should be * held by CuSP. */ std::vector> getMirrorRanges() const { std::vector> mirrorRangesVector; // order of nodes locally is masters, outgoing mirrors, incoming mirrors, // so just get from numOwned to end if (numOwned != numNodes) { assert(numOwned < numNodes); mirrorRangesVector.push_back(std::make_pair(numOwned, numNodes)); } return mirrorRangesVector; } std::vector>& getMirrorNodes() { return mirrorNodes; } private: virtual unsigned getHostIDImpl(uint64_t) const = 0; virtual bool isOwnedImpl(uint64_t) const = 0; virtual bool isLocalImpl(uint64_t) const = 0; virtual bool isVertexCutImpl() const = 0; virtual std::pair cartesianGridImpl() const { return std::make_pair(0u, 0u); } public: virtual ~DistGraph() {} //! 
Determines which host has the master for a particular node //! @returns Host id of node in question inline unsigned getHostID(uint64_t gid) const { return getHostIDImpl(gid); } //! Determine if a node has a master on this host. //! @returns True if passed in global id has a master on this host inline bool isOwned(uint64_t gid) const { return isOwnedImpl(gid); } //! Determine if a node has a proxy on this host //! @returns True if passed in global id has a proxy on this host inline bool isLocal(uint64_t gid) const { return isLocalImpl(gid); } /** * Returns true if current partition is a vertex cut * @returns true if partition being stored in this graph is a vertex cut */ inline bool is_vertex_cut() const { return isVertexCutImpl(); } /** * Returns Cartesian split (if it exists, else returns pair of 0s */ inline std::pair cartesianGrid() const { return cartesianGridImpl(); } bool isTransposed() { return transposed; } /** * Converts a local node id into a global node id * * @param nodeID local node id * @returns global node id corresponding to the local one */ inline uint64_t getGID(const uint32_t nodeID) const { return L2G(nodeID); } /** * Converts a global node id into a local node id * * @param nodeID global node id * @returns local node id corresponding to the global one */ inline uint32_t getLID(const uint64_t nodeID) const { return G2L(nodeID); } /** * Get data of a node. * * @param N node to get the data of * @param mflag access flag for node data * @returns A node data object */ inline typename GraphTy::node_data_reference getData(GraphNode N, galois::MethodFlag mflag = galois::MethodFlag::UNPROTECTED) { auto& r = graph.getData(N, mflag); return r; } /** * Get the edge data for a particular edge in the graph. 
* * @param ni edge to get the data of * @param mflag access flag for edge data * @returns The edge data for the requested edge */ inline typename GraphTy::edge_data_reference getEdgeData(edge_iterator ni, galois::MethodFlag mflag = galois::MethodFlag::UNPROTECTED) { auto& r = graph.getEdgeData(ni, mflag); return r; } /** * Gets edge destination of edge ni. * * @param ni edge id to get destination of * @returns Local ID of destination of edge ni */ GraphNode getEdgeDst(edge_iterator ni) { return graph.getEdgeDst(ni); } /** * Gets the first edge of some node. * * @param N node to get the edge of * @returns iterator to first edge of N */ inline edge_iterator edge_begin(GraphNode N) { return graph.edge_begin(N, galois::MethodFlag::UNPROTECTED); } /** * Gets the end edge boundary of some node. * * @param N node to get the edge of * @returns iterator to the end of the edges of node N, i.e. the first edge * of the next node (or an "end" iterator if there is no next node) */ inline edge_iterator edge_end(GraphNode N) { return graph.edge_end(N, galois::MethodFlag::UNPROTECTED); } /** * Returns an iterable object over the edges of a particular node in the * graph. * * @param N node to get edges iterator over */ inline galois::runtime::iterable> edges(GraphNode N) { return galois::graphs::internal::make_no_deref_range(edge_begin(N), edge_end(N)); } /** * Gets number of nodes on this (local) graph. * * @returns number of nodes present in this (local) graph */ inline size_t size() const { return graph.size(); } /** * Gets number of edges on this (local) graph. * * @returns number of edges present in this (local) graph */ inline size_t sizeEdges() const { return graph.sizeEdges(); } /** * Gets number of nodes on this (local) graph. * * @returns number of nodes present in this (local) graph */ inline size_t numMasters() const { return numOwned; } /** * Gets number of nodes with edges (may include nodes without edges) * on this (local) graph. 
* * @returns number of nodes with edges (may include nodes without edges * as it measures a contiguous range) */ inline size_t getNumNodesWithEdges() const { return numNodesWithEdges; } /** * Gets number of nodes on the global unpartitioned graph. * * @returns number of nodes present in the global unpartitioned graph */ inline size_t globalSize() const { return numGlobalNodes; } /** * Gets number of edges on the global unpartitioned graph. * * @returns number of edges present in the global unpartitioned graph */ inline size_t globalSizeEdges() const { return numGlobalEdges; } /** * Returns a range object that encapsulates all nodes of the graph. * * @returns A range object that contains all the nodes in this graph */ inline const NodeRangeType& allNodesRange() const { assert(specificRanges.size() == 3); return specificRanges[0]; } /** * Returns a range object that encapsulates only master nodes in this * graph. * * @returns A range object that contains the master nodes in this graph */ inline const NodeRangeType& masterNodesRange() const { assert(specificRanges.size() == 3); return specificRanges[1]; } /** * Returns a range object that encapsulates master nodes and nodes * with edges in this graph. * * @returns A range object that contains the master nodes and the nodes * with outgoing edges in this graph */ inline const NodeRangeType& allNodesWithEdgesRange() const { assert(specificRanges.size() == 3); return specificRanges[2]; } /** * Returns a vector object that contains the global IDs (in order) of * the master nodes in this graph. * * @returns A vector object that contains the global IDs (in order) of * the master nodes in this graph */ std::vector getMasterGlobalIDs() { std::vector IDs; IDs.reserve(numMasters()); for (auto node : masterNodesRange()) { IDs.push_back(getGID(node)); } return IDs; } protected: /** * Uses a pre-computed prefix sum to determine division of nodes among * threads. * * The call uses binary search to determine the ranges. 
*/ inline void determineThreadRanges() { allNodesRanges = galois::graphs::determineUnitRangesFromPrefixSum( galois::runtime::activeThreads, graph.getEdgePrefixSum()); } /** * Determines the thread ranges for master nodes only and saves them to * the object. * * Only call after graph is constructed + only call once */ inline void determineThreadRangesMaster() { // make sure this hasn't been called before assert(masterRanges.size() == 0); // first check if we even need to do any work; if already calculated, // use already calculated vector if (beginMaster == 0 && (beginMaster + numOwned) == size()) { masterRanges = allNodesRanges; } else if (beginMaster == 0 && (beginMaster + numOwned) == numNodesWithEdges && withEdgeRanges.size() != 0) { masterRanges = withEdgeRanges; } else { galois::gDebug("Manually det. master thread ranges"); masterRanges = galois::graphs::determineUnitRangesFromGraph( graph, galois::runtime::activeThreads, beginMaster, beginMaster + numOwned, 0); } } /** * Determines the thread ranges for nodes with edges only and saves them to * the object. * * Only call after graph is constructed + only call once */ inline void determineThreadRangesWithEdges() { // make sure not called before assert(withEdgeRanges.size() == 0); // first check if we even need to do any work; if already calculated, // use already calculated vector if (numNodesWithEdges == size()) { withEdgeRanges = allNodesRanges; } else if (beginMaster == 0 && (beginMaster + numOwned) == numNodesWithEdges && masterRanges.size() != 0) { withEdgeRanges = masterRanges; } else { galois::gDebug("Manually det. with edges thread ranges"); withEdgeRanges = galois::graphs::determineUnitRangesFromGraph( graph, galois::runtime::activeThreads, 0, numNodesWithEdges, 0); } } /** * Initializes the 3 range objects that a user can access to iterate * over the graph in different ways. 
*/ void initializeSpecificRanges() { assert(specificRanges.size() == 0); // TODO/FIXME assertion likely not safe if a host gets no nodes // make sure the thread ranges have already been calculated // for the 3 ranges assert(allNodesRanges.size() != 0); assert(masterRanges.size() != 0); assert(withEdgeRanges.size() != 0); // 0 is all nodes specificRanges.push_back(galois::runtime::makeSpecificRange( boost::counting_iterator(0), boost::counting_iterator(size()), allNodesRanges.data())); // 1 is master nodes specificRanges.push_back(galois::runtime::makeSpecificRange( boost::counting_iterator(beginMaster), boost::counting_iterator(beginMaster + numOwned), masterRanges.data())); // 2 is with edge nodes specificRanges.push_back(galois::runtime::makeSpecificRange( boost::counting_iterator(0), boost::counting_iterator(numNodesWithEdges), withEdgeRanges.data())); assert(specificRanges.size() == 3); } /** * Specific range editor: makes the range for edges equivalent to the range * for masters. */ void edgesEqualMasters() { specificRanges[2] = specificRanges[1]; } public: /** * Write the local LC_CSR graph to the file on a disk. * * @todo revive this */ void save_local_graph_to_file(std::string) { GALOIS_DIE("not implemented"); } /** * Read the local LC_CSR graph from the file on a disk. * * @todo revive this */ void read_local_graph_from_file(std::string) { GALOIS_DIE("not implemented"); } /** * Deallocates underlying LC CSR Graph */ void deallocate() { galois::gDebug("Deallocating CSR in DistGraph"); graph.deallocate(); } /** * Sort the underlying LC_CSR_Graph by ID (destinations) * It sorts edges of the nodes by destination. 
*/ void sortEdgesByDestination() { using GN = typename GraphTy::GraphNode; galois::do_all( galois::iterate(graph), [&](GN n) { graph.sortEdges(n, IdLess()); }, galois::no_stats(), galois::loopname("CSREdgeSort"), galois::steal()); } }; template constexpr const char* const galois::graphs::DistGraph::GRNAME; } // end namespace graphs } // end namespace galois #endif //_GALOIS_DIST_HGRAPH_H ================================================ FILE: libcusp/include/galois/graphs/GenericPartitioners.h ================================================ #ifndef _GALOIS_DIST_GENERICPARTS_H #define _GALOIS_DIST_GENERICPARTS_H #include "DistributedGraph.h" #include "BasePolicies.h" #include #include #include class NoCommunication : public galois::graphs::ReadMasterAssignment { public: NoCommunication(uint32_t, uint32_t numHosts, uint64_t, uint64_t) : galois::graphs::ReadMasterAssignment(0, numHosts, 0, 0) {} uint32_t getEdgeOwner(uint32_t src, uint32_t, uint64_t) const { return retrieveMaster(src); } bool noCommunication() { return true; } bool isVertexCut() const { return false; } void serializePartition(boost::archive::binary_oarchive&) {} void deserializePartition(boost::archive::binary_iarchive&) {} std::pair cartesianGrid() { return std::make_pair(0u, 0u); } }; /** */ class MiningPolicyNaive : public galois::graphs::ReadMasterAssignment { public: MiningPolicyNaive(uint32_t, uint32_t numHosts, uint64_t, uint64_t, std::vector&) : galois::graphs::ReadMasterAssignment(0, numHosts, 0, 0) {} static bool needNodeDegrees() { return false; } bool keepEdge(uint32_t src, uint32_t dst) const { return src < dst; } }; class MiningPolicyDegrees : public galois::graphs::ReadMasterAssignment { std::vector& ndegrees; public: MiningPolicyDegrees(uint32_t, uint32_t numHosts, uint64_t, uint64_t, std::vector& _ndeg) : galois::graphs::ReadMasterAssignment(0, numHosts, 0, 0), ndegrees(_ndeg) {} static bool needNodeDegrees() { return true; } bool keepEdge(uint32_t src, uint32_t dst) const { uint64_t 
sourceDegree = ndegrees[src]; uint64_t destDegree = ndegrees[dst]; if ((destDegree > sourceDegree) || ((destDegree == sourceDegree) && (src < dst))) { return true; } else { return false; } } }; //////////////////////////////////////////////////////////////////////////////// class GenericCVC : public galois::graphs::ReadMasterAssignment { unsigned numRowHosts; unsigned numColumnHosts; unsigned _h_offset; void factorizeHosts() { numColumnHosts = sqrt(_numHosts); while ((_numHosts % numColumnHosts) != 0) numColumnHosts--; numRowHosts = _numHosts / numColumnHosts; assert(numRowHosts >= numColumnHosts); // if (moreColumnHosts) { // std::swap(numRowHosts, numColumnHosts); //} if (_hostID == 0) { galois::gPrint("Cartesian grid: ", numRowHosts, " x ", numColumnHosts, "\n"); } } //! Returns the grid row ID of this host unsigned gridRowID() const { return (_hostID / numColumnHosts); } //! Returns the grid row ID of the specified host unsigned gridRowID(unsigned id) const { return (id / numColumnHosts); } //! Returns the grid column ID of this host unsigned gridColumnID() const { return (_hostID % numColumnHosts); } //! Returns the grid column ID of the specified host unsigned gridColumnID(unsigned id) const { return (id % numColumnHosts); } //! 
Find the column of a particular node unsigned getColumnOfNode(uint64_t gid) const { return gridColumnID(retrieveMaster(gid)); } public: GenericCVC(uint32_t hostID, uint32_t numHosts, uint64_t numNodes, uint64_t numEdges) : galois::graphs::ReadMasterAssignment(hostID, numHosts, numNodes, numEdges) { factorizeHosts(); _h_offset = gridRowID() * numColumnHosts; } uint32_t getEdgeOwner(uint32_t, uint32_t dst, uint64_t) const { int i = getColumnOfNode(dst); return _h_offset + i; } bool noCommunication() { return false; } bool isVertexCut() const { if ((numRowHosts == 1) || (numColumnHosts == 1)) return false; return true; } void serializePartition(boost::archive::binary_oarchive& ar) { ar << numRowHosts; ar << numColumnHosts; } void deserializePartition(boost::archive::binary_iarchive& ar) { ar >> numRowHosts; ar >> numColumnHosts; } std::pair cartesianGrid() { return std::make_pair(numRowHosts, numColumnHosts); } }; //////////////////////////////////////////////////////////////////////////////// // same as above, except columns are flipped (changes behavior of vertex cut // call as well) class GenericCVCColumnFlip : public galois::graphs::ReadMasterAssignment { unsigned numRowHosts; unsigned numColumnHosts; unsigned _h_offset; void factorizeHosts() { numColumnHosts = sqrt(_numHosts); while ((_numHosts % numColumnHosts) != 0) numColumnHosts--; numRowHosts = _numHosts / numColumnHosts; assert(numRowHosts >= numColumnHosts); // column flip std::swap(numRowHosts, numColumnHosts); if (_hostID == 0) { galois::gPrint("Cartesian grid: ", numRowHosts, " x ", numColumnHosts, "\n"); } } //! Returns the grid row ID of this host unsigned gridRowID() const { return (_hostID / numColumnHosts); } //! Returns the grid row ID of the specified host unsigned gridRowID(unsigned id) const { return (id / numColumnHosts); } //! Returns the grid column ID of this host unsigned gridColumnID() const { return (_hostID % numColumnHosts); } //! 
Returns the grid column ID of the specified host unsigned gridColumnID(unsigned id) const { return (id % numColumnHosts); } //! Find the column of a particular node unsigned getColumnOfNode(uint64_t gid) const { return gridColumnID(retrieveMaster(gid)); } public: GenericCVCColumnFlip(uint32_t hostID, uint32_t numHosts, uint64_t numNodes, uint64_t numEdges) : galois::graphs::ReadMasterAssignment(hostID, numHosts, numNodes, numEdges) { factorizeHosts(); _h_offset = gridRowID() * numColumnHosts; } uint32_t getEdgeOwner(uint32_t, uint32_t dst, uint64_t) const { int i = getColumnOfNode(dst); return _h_offset + i; } bool noCommunication() { return false; } bool isVertexCut() const { if ((numRowHosts == 1) && (numColumnHosts == 1)) return false; return true; } void serializePartition(boost::archive::binary_oarchive& ar) { ar << numRowHosts; ar << numColumnHosts; } void deserializePartition(boost::archive::binary_iarchive& ar) { ar >> numRowHosts; ar >> numColumnHosts; } std::pair cartesianGrid() { return std::make_pair(numRowHosts, numColumnHosts); } }; //////////////////////////////////////////////////////////////////////////////// class GenericHVC : public galois::graphs::ReadMasterAssignment { uint32_t _vCutThreshold; public: GenericHVC(uint32_t hostID, uint32_t numHosts, uint64_t numNodes, uint64_t numEdges) : galois::graphs::ReadMasterAssignment(hostID, numHosts, numNodes, numEdges) { _vCutThreshold = 1000; // can be changed, but default seems to be 1000 } uint32_t getEdgeOwner(uint32_t src, uint32_t dst, uint64_t numEdges) const { if (numEdges > _vCutThreshold) { return retrieveMaster(dst); } else { return retrieveMaster(src); } } bool noCommunication() { return false; } // TODO I should be able to make this runtime detectable bool isVertexCut() const { return true; } void serializePartition(boost::archive::binary_oarchive&) {} void deserializePartition(boost::archive::binary_iarchive&) {} std::pair cartesianGrid() { return std::make_pair(0u, 0u); } }; 
//////////////////////////////////////////////////////////////////////////////// class GingerP : public galois::graphs::CustomMasterAssignment { // used in hybrid cut uint32_t _vCutThreshold; // ginger scoring constants double _gamma; double _alpha; // ginger node/edge ratio double _neRatio; /** * Returns Ginger's composite balance parameter for a given host */ double getCompositeBalanceParam( unsigned host, const std::vector& nodeLoads, const std::vector>& nodeAccum, const std::vector& edgeLoads, const std::vector>& edgeAccum) { // get node/edge loads uint64_t hostNodeLoad = nodeLoads[host] + nodeAccum[host].load(); uint64_t hostEdgeLoad = edgeLoads[host] + edgeAccum[host].load(); return (hostNodeLoad + (_neRatio * hostEdgeLoad)) / 2; } /** * Use FENNEL balance equation to get a score value for partition * scoring */ double getFennelBalanceScore(double param) { return _alpha * _gamma * pow(param, _gamma - 1); } public: GingerP(uint32_t hostID, uint32_t numHosts, uint64_t numNodes, uint64_t numEdges) : galois::graphs::CustomMasterAssignment(hostID, numHosts, numNodes, numEdges) { _vCutThreshold = 1000; _gamma = 1.5; _alpha = numEdges * pow(numHosts, _gamma - 1.0) / pow(numNodes, _gamma); _neRatio = (double)numNodes / (double)numEdges; } template uint32_t getMaster(uint32_t src, galois::graphs::BufferedGraph& bufGraph, const std::vector& localNodeToMaster, std::unordered_map& gid2offsets, const std::vector& nodeLoads, std::vector>& nodeAccum, const std::vector& edgeLoads, std::vector>& edgeAccum) { auto ii = bufGraph.edgeBegin(src); auto ee = bufGraph.edgeEnd(src); // number of edges uint64_t ne = std::distance(ii, ee); // high in-degree nodes masters stay the same if (ne > _vCutThreshold) { return _hostID; } else { // low in degree masters move based on augmented FENNEL scoring metric // initialize array to hold scores galois::PODResizeableArray scores; scores.resize(_numHosts); for (unsigned i = 0; i < _numHosts; i++) { scores[i] = 0.0; } for (; ii < ee; ++ii) { 
uint64_t dst = bufGraph.edgeDestination(*ii); size_t offsetIntoMap = (unsigned)-1; auto it = gid2offsets.find(dst); if (it != gid2offsets.end()) { offsetIntoMap = it->second; } else { // determine offset offsetIntoMap = dst - bufGraph.getNodeOffset(); } assert(offsetIntoMap != (unsigned)-1); assert(offsetIntoMap < localNodeToMaster.size()); unsigned currentAssignment = localNodeToMaster[offsetIntoMap]; if (currentAssignment != (unsigned)-1) { scores[currentAssignment] += 1.0; } else { galois::gDebug("[", _hostID, "] ", dst, " unassigned"); } } // subtraction of the composite balance term for (unsigned i = 0; i < _numHosts; i++) { scores[i] -= getFennelBalanceScore(getCompositeBalanceParam( i, nodeLoads, nodeAccum, edgeLoads, edgeAccum)); } unsigned bestHost = -1; double bestScore = std::numeric_limits::lowest(); // find max score for (unsigned i = 0; i < _numHosts; i++) { if (scores[i] >= bestScore) { // galois::gDebug("best score ", bestScore, " beaten by ", scores[i]); bestScore = scores[i]; bestHost = i; } } galois::gDebug("[", _hostID, "] ", src, " assigned to ", bestHost, " with num edge ", ne); // update metadata; TODO make this a nicer interface galois::atomicAdd(nodeAccum[bestHost], (uint64_t)1); galois::atomicAdd(edgeAccum[bestHost], ne); return bestHost; } } uint32_t getEdgeOwner(uint32_t src, uint32_t dst, uint64_t numEdges) const { // if high indegree, then move to source (which is dst), else stay on // dst (which is src) // note "dst" here is actually the source on the actual graph // since we're reading transpose if (numEdges > _vCutThreshold) { return retrieveMaster(dst); } else { return retrieveMaster(src); } } bool noCommunication() { return false; } // TODO I should be able to make this runtime detectable bool isVertexCut() const { return true; } void serializePartition(boost::archive::binary_oarchive&) {} void deserializePartition(boost::archive::binary_iarchive&) {} std::pair cartesianGrid() { return std::make_pair(0u, 0u); } }; class FennelP : 
public galois::graphs::CustomMasterAssignment { // used in hybrid cut uint32_t _vCutThreshold; // ginger scoring constants double _gamma; double _alpha; // ginger node/edge ratio double _neRatio; /** * Returns Ginger's composite balance parameter for a given host */ double getCompositeBalanceParam( unsigned host, const std::vector& nodeLoads, const std::vector>& nodeAccum, const std::vector& edgeLoads, const std::vector>& edgeAccum) { // get node/edge loads uint64_t hostNodeLoad = nodeLoads[host] + nodeAccum[host].load(); uint64_t hostEdgeLoad = edgeLoads[host] + edgeAccum[host].load(); return (hostNodeLoad + (_neRatio * hostEdgeLoad)) / 2; } /** * Use FENNEL balance equation to get a score value for partition * scoring */ double getFennelBalanceScore(double param) { return _alpha * _gamma * pow(param, _gamma - 1); } public: FennelP(uint32_t hostID, uint32_t numHosts, uint64_t numNodes, uint64_t numEdges) : galois::graphs::CustomMasterAssignment(hostID, numHosts, numNodes, numEdges) { _vCutThreshold = 1000; _gamma = 1.5; _alpha = numEdges * pow(numHosts, _gamma - 1.0) / pow(numNodes, _gamma); _neRatio = (double)numNodes / (double)numEdges; } template uint32_t getMaster(uint32_t src, galois::graphs::BufferedGraph& bufGraph, const std::vector& localNodeToMaster, std::unordered_map& gid2offsets, const std::vector& nodeLoads, std::vector>& nodeAccum, const std::vector& edgeLoads, std::vector>& edgeAccum) { auto ii = bufGraph.edgeBegin(src); auto ee = bufGraph.edgeEnd(src); // number of edges uint64_t ne = std::distance(ii, ee); // high degree nodes masters stay the same if (ne > _vCutThreshold) { return _hostID; } else { // low degree masters move based on augmented FENNEL scoring metric // initialize array to hold scores galois::PODResizeableArray scores; scores.resize(_numHosts); for (unsigned i = 0; i < _numHosts; i++) { scores[i] = 0.0; } for (; ii < ee; ++ii) { uint64_t dst = bufGraph.edgeDestination(*ii); size_t offsetIntoMap = (unsigned)-1; auto it = 
gid2offsets.find(dst); if (it != gid2offsets.end()) { offsetIntoMap = it->second; } else { // determine offset offsetIntoMap = dst - bufGraph.getNodeOffset(); } assert(offsetIntoMap != (unsigned)-1); assert(offsetIntoMap < localNodeToMaster.size()); unsigned currentAssignment = localNodeToMaster[offsetIntoMap]; if (currentAssignment != (unsigned)-1) { scores[currentAssignment] += 1.0; } else { galois::gDebug("[", _hostID, "] ", dst, " unassigned"); } } // subtraction of the composite balance term for (unsigned i = 0; i < _numHosts; i++) { scores[i] -= getFennelBalanceScore(getCompositeBalanceParam( i, nodeLoads, nodeAccum, edgeLoads, edgeAccum)); } unsigned bestHost = -1; double bestScore = std::numeric_limits::lowest(); // find max score for (unsigned i = 0; i < _numHosts; i++) { if (scores[i] >= bestScore) { // galois::gDebug("best score ", bestScore, " beaten by ", scores[i]); bestScore = scores[i]; bestHost = i; } } galois::gDebug("[", _hostID, "] ", src, " assigned to ", bestHost, " with num edge ", ne); // update metadata; TODO make this a nicer interface galois::atomicAdd(nodeAccum[bestHost], (uint64_t)1); galois::atomicAdd(edgeAccum[bestHost], ne); return bestHost; } } // Fennel is an edge cut: all edges on source uint32_t getEdgeOwner(uint32_t src, uint32_t, uint64_t) const { return retrieveMaster(src); } bool noCommunication() { return false; } // TODO I should be able to make this runtime detectable bool isVertexCut() const { return false; } void serializePartition(boost::archive::binary_oarchive&) {} void deserializePartition(boost::archive::binary_iarchive&) {} std::pair cartesianGrid() { return std::make_pair(0u, 0u); } }; class SugarP : public galois::graphs::CustomMasterAssignment { // used in hybrid cut uint32_t _vCutThreshold; // ginger scoring constants double _gamma; double _alpha; // ginger node/edge ratio double _neRatio; unsigned numRowHosts; unsigned numColumnHosts; void factorizeHosts() { numColumnHosts = sqrt(_numHosts); while ((_numHosts 
% numColumnHosts) != 0) numColumnHosts--; numRowHosts = _numHosts / numColumnHosts; assert(numRowHosts >= numColumnHosts); if (_hostID == 0) { galois::gPrint("Cartesian grid: ", numRowHosts, " x ", numColumnHosts, "\n"); } } //! Returns the grid row ID of this host unsigned gridRowID() const { return (_hostID / numColumnHosts); } //! Returns the grid row ID of the specified host unsigned gridRowID(unsigned id) const { return (id / numColumnHosts); } //! Returns the grid column ID of this host unsigned gridColumnID() const { return (_hostID % numColumnHosts); } //! Returns the grid column ID of the specified host unsigned gridColumnID(unsigned id) const { return (id % numColumnHosts); } //! Find the row of a particular node unsigned getRowOfNode(uint64_t gid) const { return gridRowID(retrieveMaster(gid)); } //! Find the column of a particular node unsigned getColumnOfNode(uint64_t gid) const { return gridColumnID(retrieveMaster(gid)); } /** * Returns Ginger's composite balance parameter for a given host */ double getCompositeBalanceParam( unsigned host, const std::vector& nodeLoads, const std::vector>& nodeAccum, const std::vector& edgeLoads, const std::vector>& edgeAccum) { // get node/edge loads uint64_t hostNodeLoad = nodeLoads[host] + nodeAccum[host].load(); uint64_t hostEdgeLoad = edgeLoads[host] + edgeAccum[host].load(); return (hostNodeLoad + (_neRatio * hostEdgeLoad)) / 2; } /** * Use FENNEL balance equation to get a score value for partition * scoring */ double getFennelBalanceScore(double param) { return _alpha * _gamma * pow(param, _gamma - 1); } public: SugarP(uint32_t hostID, uint32_t numHosts, uint64_t numNodes, uint64_t numEdges) : galois::graphs::CustomMasterAssignment(hostID, numHosts, numNodes, numEdges) { _vCutThreshold = 1000; _gamma = 1.5; _alpha = numEdges * pow(numHosts, _gamma - 1.0) / pow(numNodes, _gamma); _neRatio = (double)numNodes / (double)numEdges; // CVC things factorizeHosts(); } template uint32_t getMaster(uint32_t src, 
galois::graphs::BufferedGraph& bufGraph, const std::vector& localNodeToMaster, std::unordered_map& gid2offsets, const std::vector& nodeLoads, std::vector>& nodeAccum, const std::vector& edgeLoads, std::vector>& edgeAccum) { auto ii = bufGraph.edgeBegin(src); auto ee = bufGraph.edgeEnd(src); // number of edges uint64_t ne = std::distance(ii, ee); // high degree nodes masters stay the same if (ne > _vCutThreshold) { return _hostID; } else { // low degree masters move based on augmented FENNEL scoring metric // initialize array to hold scores galois::PODResizeableArray scores; scores.resize(_numHosts); for (unsigned i = 0; i < _numHosts; i++) { scores[i] = 0.0; } for (; ii < ee; ++ii) { uint64_t dst = bufGraph.edgeDestination(*ii); size_t offsetIntoMap = (unsigned)-1; auto it = gid2offsets.find(dst); if (it != gid2offsets.end()) { offsetIntoMap = it->second; } else { // determine offset offsetIntoMap = dst - bufGraph.getNodeOffset(); } assert(offsetIntoMap != (unsigned)-1); assert(offsetIntoMap < localNodeToMaster.size()); unsigned currentAssignment = localNodeToMaster[offsetIntoMap]; if (currentAssignment != (unsigned)-1) { scores[currentAssignment] += 1.0; } else { // galois::gDebug("[", _hostID, "] ", dst, " unassigned"); } } // subtraction of the composite balance term for (unsigned i = 0; i < _numHosts; i++) { scores[i] -= getFennelBalanceScore(getCompositeBalanceParam( i, nodeLoads, nodeAccum, edgeLoads, edgeAccum)); } unsigned bestHost = -1; double bestScore = std::numeric_limits::lowest(); // find max score for (unsigned i = 0; i < _numHosts; i++) { if (scores[i] >= bestScore) { // galois::gDebug("best score ", bestScore, " beaten by ", scores[i]); bestScore = scores[i]; bestHost = i; } } galois::gDebug("[", _hostID, "] ", src, " assigned to ", bestHost, " with num edge ", ne); // update metadata; TODO make this a nicer interface galois::atomicAdd(nodeAccum[bestHost], (uint64_t)1); galois::atomicAdd(edgeAccum[bestHost], ne); return bestHost; } } /** * return 
owner of edge using cartesian edge owner determination */ uint32_t getEdgeOwner(uint32_t src, uint32_t dst, uint64_t) const { unsigned blockedRowOffset = getRowOfNode(src) * numColumnHosts; unsigned cyclicColumnOffset = getColumnOfNode(dst); return blockedRowOffset + cyclicColumnOffset; } bool noCommunication() { return false; } bool isVertexCut() const { if ((numRowHosts == 1) || (numColumnHosts == 1)) return false; return true; } void serializePartition(boost::archive::binary_oarchive& ar) { ar << numRowHosts; ar << numColumnHosts; } void deserializePartition(boost::archive::binary_iarchive& ar) { ar >> numRowHosts; ar >> numColumnHosts; } std::pair cartesianGrid() { return std::make_pair(numRowHosts, numColumnHosts); } }; class SugarColumnFlipP : public galois::graphs::CustomMasterAssignment { // used in hybrid cut uint32_t _vCutThreshold; // ginger scoring constants double _gamma; double _alpha; // ginger node/edge ratio double _neRatio; unsigned numRowHosts; unsigned numColumnHosts; void factorizeHosts() { numColumnHosts = sqrt(_numHosts); while ((_numHosts % numColumnHosts) != 0) numColumnHosts--; numRowHosts = _numHosts / numColumnHosts; assert(numRowHosts >= numColumnHosts); // column flip std::swap(numRowHosts, numColumnHosts); if (_hostID == 0) { galois::gPrint("Cartesian grid: ", numRowHosts, " x ", numColumnHosts, "\n"); } } //! Returns the grid row ID of this host unsigned gridRowID() const { return (_hostID / numColumnHosts); } //! Returns the grid row ID of the specified host unsigned gridRowID(unsigned id) const { return (id / numColumnHosts); } //! Returns the grid column ID of this host unsigned gridColumnID() const { return (_hostID % numColumnHosts); } //! Returns the grid column ID of the specified host unsigned gridColumnID(unsigned id) const { return (id % numColumnHosts); } //! Find the row of a particular node unsigned getRowOfNode(uint64_t gid) const { return gridRowID(retrieveMaster(gid)); } //! 
Find the column of a particular node unsigned getColumnOfNode(uint64_t gid) const { return gridColumnID(retrieveMaster(gid)); } /** * Returns Ginger's composite balance parameter for a given host */ double getCompositeBalanceParam( unsigned host, const std::vector& nodeLoads, const std::vector>& nodeAccum, const std::vector& edgeLoads, const std::vector>& edgeAccum) { // get node/edge loads uint64_t hostNodeLoad = nodeLoads[host] + nodeAccum[host].load(); uint64_t hostEdgeLoad = edgeLoads[host] + edgeAccum[host].load(); return (hostNodeLoad + (_neRatio * hostEdgeLoad)) / 2; } /** * Use FENNEL balance equation to get a score value for partition * scoring */ double getFennelBalanceScore(double param) { return _alpha * _gamma * pow(param, _gamma - 1); } public: SugarColumnFlipP(uint32_t hostID, uint32_t numHosts, uint64_t numNodes, uint64_t numEdges) : galois::graphs::CustomMasterAssignment(hostID, numHosts, numNodes, numEdges) { _vCutThreshold = 1000; _gamma = 1.5; _alpha = numEdges * pow(numHosts, _gamma - 1.0) / pow(numNodes, _gamma); _neRatio = (double)numNodes / (double)numEdges; // CVC things factorizeHosts(); } template uint32_t getMaster(uint32_t src, galois::graphs::BufferedGraph& bufGraph, const std::vector& localNodeToMaster, std::unordered_map& gid2offsets, const std::vector& nodeLoads, std::vector>& nodeAccum, const std::vector& edgeLoads, std::vector>& edgeAccum) { auto ii = bufGraph.edgeBegin(src); auto ee = bufGraph.edgeEnd(src); // number of edges uint64_t ne = std::distance(ii, ee); // high degree nodes masters stay the same if (ne > _vCutThreshold) { return _hostID; } else { // low degree masters move based on augmented FENNEL scoring metric // initialize array to hold scores galois::PODResizeableArray scores; scores.resize(_numHosts); for (unsigned i = 0; i < _numHosts; i++) { scores[i] = 0.0; } for (; ii < ee; ++ii) { uint64_t dst = bufGraph.edgeDestination(*ii); size_t offsetIntoMap = (unsigned)-1; auto it = gid2offsets.find(dst); if (it != 
gid2offsets.end()) { offsetIntoMap = it->second; } else { // determine offset offsetIntoMap = dst - bufGraph.getNodeOffset(); } assert(offsetIntoMap != (unsigned)-1); assert(offsetIntoMap < localNodeToMaster.size()); unsigned currentAssignment = localNodeToMaster[offsetIntoMap]; if (currentAssignment != (unsigned)-1) { scores[currentAssignment] += 1.0; } else { galois::gDebug("[", _hostID, "] ", dst, " unassigned"); } } // subtraction of the composite balance term for (unsigned i = 0; i < _numHosts; i++) { scores[i] -= getFennelBalanceScore(getCompositeBalanceParam( i, nodeLoads, nodeAccum, edgeLoads, edgeAccum)); } unsigned bestHost = -1; double bestScore = std::numeric_limits::lowest(); // find max score for (unsigned i = 0; i < _numHosts; i++) { if (scores[i] >= bestScore) { // galois::gDebug("best score ", bestScore, " beaten by ", scores[i]); bestScore = scores[i]; bestHost = i; } } galois::gDebug("[", _hostID, "] ", src, " assigned to ", bestHost, " with num edge ", ne); // update metadata; TODO make this a nicer interface galois::atomicAdd(nodeAccum[bestHost], (uint64_t)1); galois::atomicAdd(edgeAccum[bestHost], ne); return bestHost; } } /** * return owner of edge using cartesian edge owner determination */ uint32_t getEdgeOwner(uint32_t src, uint32_t dst, uint64_t) const { unsigned blockedRowOffset = getRowOfNode(src) * numColumnHosts; unsigned cyclicColumnOffset = getColumnOfNode(dst); return blockedRowOffset + cyclicColumnOffset; } bool noCommunication() { return false; } bool isVertexCut() const { if ((numRowHosts == 1) && (numColumnHosts == 1)) return false; return true; } void serializePartition(boost::archive::binary_oarchive& ar) { ar << numRowHosts; ar << numColumnHosts; } void deserializePartition(boost::archive::binary_iarchive& ar) { ar >> numRowHosts; ar >> numColumnHosts; } std::pair cartesianGrid() { return std::make_pair(numRowHosts, numColumnHosts); } }; #endif ================================================ FILE: 
libcusp/include/galois/graphs/MiningPartitioner.h ================================================ /* * This file belongs to the Galois project, a C++ library for exploiting * parallelism. The code is being released under the terms of the 3-Clause BSD * License (a copy is located in LICENSE.txt at the top-level directory). * * Copyright (C) 2019, The University of Texas at Austin. All rights reserved. * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY, * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF * DEALING OR USAGE OF TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances * shall University be liable for incidental, special, indirect, direct or * consequential damages or loss of profits, interruption of business, or * related expenses which may arise from use of Software or Documentation, * including but not limited to those resulting from defects in Software and/or * Documentation, or loss or inaccuracy of data of any kind. */ /** * @file MiningPartitioner.h * * Graph mining partitioning that duplicates edges. Currently only supports an * outgoing edge cut. * * TODO lots of code dpulication here with regular cusp partitioner; need to * merge */ #ifndef _GALOIS_DIST_MINING_H #define _GALOIS_DIST_MINING_H #include "galois/graphs/DistributedGraph.h" #include "galois/DReducible.h" namespace galois { namespace graphs { /** * @tparam NodeTy type of node data for the graph * @tparam EdgeTy type of edge data for the graph * * @todo fully document and clean up code * @warning not meant for public use + not fully documented yet */ template class MiningGraph : public DistGraph { //! 
size used to buffer edge sends during partitioning constexpr static unsigned edgePartitionSendBufSize = 8388608; constexpr static const char* const GRNAME = "dGraph_Mining"; std::unique_ptr graphPartitioner; uint32_t G2LEdgeCut(uint64_t gid, uint32_t globalOffset) const { assert(base_DistGraph::isLocal(gid)); // optimized for edge cuts if (gid >= globalOffset && gid < globalOffset + base_DistGraph::numOwned) return gid - globalOffset; return base_DistGraph::globalToLocalMap.at(gid); } /** * Free memory of a vector by swapping an empty vector with it */ template void freeVector(V& vectorToKill) { V dummyVector; vectorToKill.swap(dummyVector); } uint32_t nodesToReceive; uint64_t myKeptEdges; uint64_t myReadEdges; uint64_t globalKeptEdges; uint64_t totalEdgeProxies; std::vector> mirrorEdges; std::unordered_map localEdgeGIDToLID; std::vector getNodeDegrees(const std::string filename, uint32_t numNodes) { std::vector nodeDegrees; nodeDegrees.resize(numNodes); // read in prefix sum from GR on disk std::ifstream graphFile(filename.c_str()); graphFile.seekg(sizeof(uint64_t) * 4); uint64_t* outIndexBuffer = (uint64_t*)malloc(sizeof(uint64_t) * numNodes); if (outIndexBuffer == nullptr) { GALOIS_DIE("out of memory"); } uint64_t numBytesToLoad = numNodes * sizeof(uint64_t); uint64_t bytesRead = 0; while (numBytesToLoad > 0) { graphFile.read(((char*)outIndexBuffer) + bytesRead, numBytesToLoad); size_t numRead = graphFile.gcount(); numBytesToLoad -= numRead; bytesRead += numRead; } assert(numBytesToLoad == 0); galois::do_all( galois::iterate(0u, numNodes), [&](unsigned n) { if (n != 0) { nodeDegrees[n] = outIndexBuffer[n] - outIndexBuffer[n - 1]; } else { nodeDegrees[n] = outIndexBuffer[0]; } // galois::gDebug(n, " degree ", nodeDegrees[n]); }, galois::loopname("GetNodeDegrees"), galois::no_stats()); free(outIndexBuffer); #ifndef NDEBUG if (base_DistGraph::id == 0) { galois::gDebug("Sanity checking node degrees"); } galois::GAccumulator edgeCount; galois::do_all( 
galois::iterate(0u, numNodes), [&](unsigned n) { edgeCount += nodeDegrees[n]; }, galois::loopname("SanityCheckDegrees"), galois::no_stats()); GALOIS_ASSERT(edgeCount.reduce() == base_DistGraph::numGlobalEdges); #endif return nodeDegrees; } virtual unsigned getHostIDImpl(uint64_t gid) const { assert(gid < base_DistGraph::numGlobalNodes); return graphPartitioner->retrieveMaster(gid); } virtual bool isOwnedImpl(uint64_t gid) const { assert(gid < base_DistGraph::numGlobalNodes); return (graphPartitioner->retrieveMaster(gid) == base_DistGraph::id); } virtual bool isLocalImpl(uint64_t gid) const { assert(gid < base_DistGraph::numGlobalNodes); return (base_DistGraph::globalToLocalMap.find(gid) != base_DistGraph::globalToLocalMap.end()); } virtual bool isVertexCutImpl() const { return false; } public: //! typedef for base DistGraph class using base_DistGraph = DistGraph; /** * Returns edges owned by this graph (i.e. read). */ uint64_t numOwnedEdges() const { return myKeptEdges; } /** * Returns # edges kept in all graphs. */ uint64_t globalEdges() const { return globalKeptEdges; } std::vector>& getMirrorEdges() { return mirrorEdges; } /** * Return the reader of a particular node. 
* @param gid GID of node to get reader of * @return Host reader of node passed in as param */ unsigned getHostReader(uint64_t gid) const { for (auto i = 0U; i < base_DistGraph::numHosts; ++i) { uint64_t start, end; std::tie(start, end) = base_DistGraph::gid2host[i]; if (gid >= start && gid < end) { return i; } } return -1; } /** * Constructor */ MiningGraph( const std::string& filename, unsigned host, unsigned _numHosts, bool setupGluon = true, bool doSort = false, galois::graphs::MASTERS_DISTRIBUTION md = BALANCED_EDGES_OF_MASTERS, uint32_t nodeWeight = 0, uint32_t edgeWeight = 0) : base_DistGraph(host, _numHosts) { galois::runtime::reportParam(GRNAME, "MiningGraph", "0"); galois::CondStatTimer Tgraph_construct( "GraphPartitioningTime", GRNAME); Tgraph_construct.start(); //////////////////////////////////////////////////////////////////////////// galois::graphs::OfflineGraph g(filename); base_DistGraph::numGlobalNodes = g.size(); base_DistGraph::numGlobalEdges = g.sizeEdges(); std::vector dummy; // not actually getting masters, but getting assigned readers for nodes base_DistGraph::computeMasters(md, g, dummy, nodeWeight, edgeWeight); std::vector ndegrees; if (Partitioner::needNodeDegrees()) { if (base_DistGraph::id == 0) { galois::gInfo("Calculating node degrees for partitioner"); } galois::runtime::reportParam(GRNAME, "UsingDegreeOrdering", "1"); ndegrees = getNodeDegrees(filename, base_DistGraph::numGlobalNodes); } graphPartitioner = std::make_unique( host, _numHosts, base_DistGraph::numGlobalNodes, base_DistGraph::numGlobalEdges, ndegrees); graphPartitioner->saveGIDToHost(base_DistGraph::gid2host); //////////////////////////////////////////////////////////////////////////// uint64_t nodeBegin = base_DistGraph::gid2host[base_DistGraph::id].first; typename galois::graphs::OfflineGraph::edge_iterator edgeBegin = g.edge_begin(nodeBegin); uint64_t nodeEnd = base_DistGraph::gid2host[base_DistGraph::id].second; typename galois::graphs::OfflineGraph::edge_iterator 
edgeEnd = g.edge_begin(nodeEnd); galois::gPrint("[", base_DistGraph::id, "] Starting graph reading.\n"); // never read edge data from disk galois::graphs::BufferedGraph bufGraph; bufGraph.resetReadCounters(); galois::StatTimer graphReadTimer("GraphReading", GRNAME); graphReadTimer.start(); bufGraph.loadPartialGraph(filename, nodeBegin, nodeEnd, *edgeBegin, *edgeEnd, base_DistGraph::numGlobalNodes, base_DistGraph::numGlobalEdges); graphReadTimer.stop(); galois::gPrint("[", base_DistGraph::id, "] Reading graph complete.\n"); //////////////////////////////////////////////////////////////////////////// galois::StatTimer inspectionTimer("EdgeInspection", GRNAME); inspectionTimer.start(); bufGraph.resetReadCounters(); galois::gstl::Vector prefixSumOfEdges; base_DistGraph::numOwned = nodeEnd - nodeBegin; prefixSumOfEdges.resize(base_DistGraph::numOwned); // initial pass; set up lid-gid mappings, determine which proxies exist on // this host; prefix sum of edges cna be set up up to the last master // node galois::DynamicBitSet presentProxies = edgeInspectionRound1(bufGraph, prefixSumOfEdges); // set my read nodes on present proxies // TODO parallel? 
for (uint64_t i = nodeBegin; i < nodeEnd; i++) { presentProxies.set(i); } // vector to store bitsets received from other hosts std::vector proxiesOnOtherHosts; proxiesOnOtherHosts.resize(_numHosts); // send off mirror proxies that exist on this host to other hosts communicateProxyInfo(presentProxies, proxiesOnOtherHosts); // signifies how many outgoing edges a particular host should expect from // this host std::vector> numOutgoingEdges; numOutgoingEdges.resize(base_DistGraph::numHosts); // edge inspection phase 2: determine how many edges to send to each host // don't actually send yet edgeInspectionRound2(bufGraph, numOutgoingEdges, proxiesOnOtherHosts); // prefix sum finalization finalizePrefixSum(numOutgoingEdges, prefixSumOfEdges); // doubly make sure the data is cleared freeVector(numOutgoingEdges); // should no longer use this variable inspectionTimer.stop(); //////////////////////////////////////////////////////////////////////////// galois::StatTimer allocationTimer("GraphAllocation", GRNAME); allocationTimer.start(); // Graph construction related calls base_DistGraph::beginMaster = 0; // Allocate and construct the graph base_DistGraph::graph.allocateFrom(base_DistGraph::numNodes, base_DistGraph::numEdges); base_DistGraph::graph.constructNodes(); // edge end fixing auto& base_graph = base_DistGraph::graph; galois::do_all( galois::iterate((uint32_t)0, base_DistGraph::numNodes), [&](uint64_t n) { base_graph.fixEndEdge(n, prefixSumOfEdges[n]); }, #if MORE_DIST_STATS galois::loopname("FixEndEdgeLoop"), #endif galois::no_stats()); // get memory from prefix sum back prefixSumOfEdges.clear(); freeVector(prefixSumOfEdges); // should no longer use this variable allocationTimer.stop(); //////////////////////////////////////////////////////////////////////////// if (setupGluon) { galois::CondStatTimer TfillMirrors("FillMirrors", GRNAME); TfillMirrors.start(); fillMirrors(); TfillMirrors.stop(); } 
//////////////////////////////////////////////////////////////////////////// loadEdges(base_DistGraph::graph, bufGraph, proxiesOnOtherHosts); // TODO this might be useful to keep around proxiesOnOtherHosts.clear(); ndegrees.clear(); // SORT EDGES if (doSort) { base_DistGraph::sortEdgesByDestination(); } if (setupGluon) { galois::CondStatTimer TfillMirrorsEdges( "FillMirrorsEdges", GRNAME); TfillMirrorsEdges.start(); // edges mirrorEdges.resize(base_DistGraph::numHosts); galois::gPrint("[", base_DistGraph::id, "] Filling mirrors and creating " "mirror map\n"); fillMirrorsEdgesAndCreateMirrorMap(); TfillMirrorsEdges.stop(); } //////////////////////////////////////////////////////////////////////////// galois::CondStatTimer Tthread_ranges("ThreadRangesTime", GRNAME); galois::gPrint("[", base_DistGraph::id, "] Determining thread ranges\n"); Tthread_ranges.start(); base_DistGraph::determineThreadRanges(); base_DistGraph::determineThreadRangesMaster(); base_DistGraph::determineThreadRangesWithEdges(); base_DistGraph::initializeSpecificRanges(); Tthread_ranges.stop(); Tgraph_construct.stop(); galois::gPrint("[", base_DistGraph::id, "] Graph construction complete.\n"); galois::DGAccumulator accumer; accumer.reset(); accumer += base_DistGraph::sizeEdges(); totalEdgeProxies = accumer.reduce(); uint64_t totalNodeProxies; accumer.reset(); accumer += base_DistGraph::size(); totalNodeProxies = accumer.reduce(); // report some statistics if (base_DistGraph::id == 0) { galois::runtime::reportStat_Single( GRNAME, std::string("TotalNodeProxies"), totalNodeProxies); galois::runtime::reportStat_Single( GRNAME, std::string("TotalEdgeProxies"), totalEdgeProxies); galois::runtime::reportStat_Single(GRNAME, std::string("OriginalNumberEdges"), base_DistGraph::globalSizeEdges()); galois::runtime::reportStat_Single(GRNAME, std::string("TotalKeptEdges"), globalKeptEdges); GALOIS_ASSERT(globalKeptEdges * 2 == base_DistGraph::globalSizeEdges()); galois::runtime::reportStat_Single( GRNAME, 
std::string("ReplicationFactorNodes"),
          (totalNodeProxies) / (double)base_DistGraph::globalSize());
      // NOTE(review): stat name "ReplicatonFactorEdges" is missing an 'i';
      // kept as-is since external tooling may key on the exact string
      galois::runtime::reportStat_Single(
          GRNAME, std::string("ReplicatonFactorEdges"),
          (totalEdgeProxies) / (double)globalKeptEdges);
    }
  }

private:
  /**
   * First inspection pass over this host's read range of the on-disk edges.
   * For every read node, counts edges kept by the partitioning policy
   * (un-finalized prefix sum) and marks non-master destinations as incoming
   * mirrors; also sets up the local-to-global and global-to-local id maps.
   *
   * @param bufGraph buffered graph holding this host's portion of the edges
   * @param prefixSumOfEdges per-local-node kept-edge counts (finalized later)
   * @returns bitset over global ids marking proxies present on this host
   */
  galois::DynamicBitSet
  edgeInspectionRound1(galois::graphs::BufferedGraph& bufGraph,
                       galois::gstl::Vector& prefixSumOfEdges) {
    galois::DynamicBitSet incomingMirrors;
    incomingMirrors.resize(base_DistGraph::numGlobalNodes);
    incomingMirrors.reset();
    uint32_t myID = base_DistGraph::id;
    uint64_t globalOffset = base_DistGraph::gid2host[base_DistGraph::id].first;

    // already set before this is called
    base_DistGraph::localToGlobalVector.resize(base_DistGraph::numOwned);

    // kept edges are reduced across hosts; read-edge count is host-local
    galois::DGAccumulator keptEdges;
    keptEdges.reset();
    galois::GAccumulator allEdges;
    allEdges.reset();

    auto& ltgv = base_DistGraph::localToGlobalVector;
    galois::do_all(
        galois::iterate(base_DistGraph::gid2host[base_DistGraph::id].first,
                        base_DistGraph::gid2host[base_DistGraph::id].second),
        [&](size_t n) {
          uint64_t edgeCount = 0;
          auto ii = bufGraph.edgeBegin(n);
          auto ee = bufGraph.edgeEnd(n);
          allEdges += std::distance(ii, ee);
          for (; ii < ee; ++ii) {
            uint32_t dst = bufGraph.edgeDestination(*ii);
            if (graphPartitioner->keepEdge(n, dst)) {
              edgeCount++;
              keptEdges += 1;
              // which mirrors do I have
              if (graphPartitioner->retrieveMaster(dst) != myID) {
                incomingMirrors.set(dst);
              }
            }
          }
          // raw per-node count; prefix-summed later in finalizeInspection
          prefixSumOfEdges[n - globalOffset] = edgeCount;
          ltgv[n - globalOffset] = n;
        },
#if MORE_DIST_STATS
        galois::loopname("EdgeInspectionLoop"),
#endif
        galois::steal(), galois::no_stats());
    myKeptEdges = keptEdges.read_local();
    myReadEdges = allEdges.reduce();
    globalKeptEdges = keptEdges.reduce();

    // get incoming mirrors ready for creation
    uint32_t additionalMirrorCount = incomingMirrors.count();
    base_DistGraph::localToGlobalVector.resize(
        base_DistGraph::localToGlobalVector.size() + additionalMirrorCount);
    // note prefix sum will get finalized in a later step
    if (base_DistGraph::numOwned > 0) {
      prefixSumOfEdges.resize(prefixSumOfEdges.size() + additionalMirrorCount,
                              0);
    } else {
      prefixSumOfEdges.resize(additionalMirrorCount, 0);
    }

    // map creation: lid to gid; mirrors are appended after the owned nodes
    if (additionalMirrorCount > 0) {
      uint32_t totalNumNodes = base_DistGraph::numGlobalNodes;
      uint32_t activeThreads = galois::getActiveThreads();
      // first on_each: count mirrors in each thread's block of global ids
      std::vector threadPrefixSums(activeThreads);
      galois::on_each([&](unsigned tid, unsigned nthreads) {
        size_t beginNode;
        size_t endNode;
        std::tie(beginNode, endNode) =
            galois::block_range(0u, totalNumNodes, tid, nthreads);
        uint64_t count = 0;
        for (size_t i = beginNode; i < endNode; i++) {
          if (incomingMirrors.test(i))
            ++count;
        }
        threadPrefixSums[tid] = count;
      });
      // get prefix sums
      for (unsigned int i = 1; i < threadPrefixSums.size(); i++) {
        threadPrefixSums[i] += threadPrefixSums[i - 1];
      }
      assert(threadPrefixSums.back() == additionalMirrorCount);

      uint32_t startingNodeIndex = base_DistGraph::numOwned;
      // do actual work, second on_each: each thread writes its mirrors' gids
      // into the disjoint slice computed above
      galois::on_each([&](unsigned tid, unsigned nthreads) {
        size_t beginNode;
        size_t endNode;
        std::tie(beginNode, endNode) =
            galois::block_range(0u, totalNumNodes, tid, nthreads);
        // start location to start adding things into prefix sums/vectors
        uint32_t threadStartLocation = 0;
        if (tid != 0) {
          threadStartLocation = threadPrefixSums[tid - 1];
        }
        uint32_t handledNodes = 0;
        for (size_t i = beginNode; i < endNode; i++) {
          if (incomingMirrors.test(i)) {
            base_DistGraph::localToGlobalVector[startingNodeIndex +
                                                threadStartLocation +
                                                handledNodes] = i;
            handledNodes++;
          }
        }
      });
    }
    base_DistGraph::numNodes = base_DistGraph::numOwned + additionalMirrorCount;
    base_DistGraph::numNodesWithEdges = base_DistGraph::numNodes;
    assert(base_DistGraph::localToGlobalVector.size() ==
           base_DistGraph::numNodes);

    // g2l mapping
    base_DistGraph::globalToLocalMap.reserve(base_DistGraph::numNodes);
    for (unsigned i = 0; i < base_DistGraph::numNodes; i++) {
      // global to local map construction
      base_DistGraph::globalToLocalMap[base_DistGraph::localToGlobalVector[i]] =
          i;
    }
    assert(base_DistGraph::globalToLocalMap.size() == base_DistGraph::numNodes);
    return incomingMirrors;
  }

  /**
   * Communicate to other hosts which proxies exist on this host.
   *
   * @param presentProxies Bitset marking which proxies are present on this host
   * @param proxiesOnOtherHosts Vector to deserialize received bitsets into
   */
  void communicateProxyInfo(
      galois::DynamicBitSet& presentProxies,
      std::vector& proxiesOnOtherHosts) {
    auto& net = galois::runtime::getSystemNetworkInterface();
    // Send proxies on this host to other hosts
    for (unsigned h = 0; h < base_DistGraph::numHosts; ++h) {
      if (h != base_DistGraph::id) {
        galois::runtime::SendBuffer bitsetBuffer;
        galois::runtime::gSerialize(bitsetBuffer, presentProxies);
        net.sendTagged(h, galois::runtime::evilPhase, bitsetBuffer);
      }
    }
    // receive loop: exactly one message expected from every other host
    for (unsigned h = 0; h < net.Num - 1; h++) {
      decltype(net.recieveTagged(galois::runtime::evilPhase, nullptr)) p;
      do {
        p = net.recieveTagged(galois::runtime::evilPhase, nullptr);
      } while (!p);
      uint32_t sendingHost = p->first;
      // deserialize proxiesOnOtherHosts
      galois::runtime::gDeserialize(p->second,
                                    proxiesOnOtherHosts[sendingHost]);
    }
    base_DistGraph::increment_evilPhase();
  }

  /**
   * Second inspection pass: count, per read node, how many kept edges must be
   * duplicated on each other host, then exchange those counts with all hosts.
   *
   * @param bufGraph buffered graph holding this host's portion of the edges
   * @param numOutgoingEdges per-host, per-read-node counts to be sent out
   * @param proxiesOnOtherHosts which global ids exist on each other host
   */
  void edgeInspectionRound2(
      galois::graphs::BufferedGraph& bufGraph,
      std::vector>& numOutgoingEdges,
      std::vector& proxiesOnOtherHosts) {
    auto& net = galois::runtime::getSystemNetworkInterface();

    // allocate vectors for counting edges that must be sent
    // number of nodes that this host has read from disk
    uint32_t numRead = base_DistGraph::gid2host[base_DistGraph::id].second -
                       base_DistGraph::gid2host[base_DistGraph::id].first;
    // allocate space for outgoing edges
    for (uint32_t i = 0; i < base_DistGraph::numHosts; ++i) {
      numOutgoingEdges[i].assign(numRead, 0);
    }

    uint64_t globalOffset = base_DistGraph::gid2host[base_DistGraph::id].first;
    galois::DynamicBitSet hostHasOutgoing;
    hostHasOutgoing.resize(base_DistGraph::numHosts);
    hostHasOutgoing.reset();

    // flip loop order, this can be optimized
    // for each host,
// loop over my local nodes
    galois::do_all(
        galois::iterate(base_DistGraph::gid2host[base_DistGraph::id].first,
                        base_DistGraph::gid2host[base_DistGraph::id].second),
        [&](size_t n) {
          auto ii = bufGraph.edgeBegin(n);
          auto ee = bufGraph.edgeEnd(n);
          for (; ii < ee; ++ii) {
            uint32_t dst = bufGraph.edgeDestination(*ii);
            // make sure this edge is going to be kept and not dropped
            if (graphPartitioner->keepEdge(n, dst)) {
              for (unsigned h = 0; h < net.Num; h++) {
                if (h != net.ID) {
                  if (proxiesOnOtherHosts[h].test(n)) {
                    // if kept, make sure destination exists on that host
                    if (proxiesOnOtherHosts[h].test(dst)) {
                      // if it does, this edge must be duplicated on that host;
                      // increment count
                      numOutgoingEdges[h][n - globalOffset] += 1;
                      hostHasOutgoing.set(h);
                    }
                  }
                }
              }
            }
          }
        },
#if MORE_DIST_STATS
        galois::loopname("EdgeInspectionRound2Loop"),
#endif
        galois::steal(), galois::no_stats());

    // send data off, then receive it
    sendInspectionData(numOutgoingEdges, hostHasOutgoing);
    recvInspectionData(numOutgoingEdges);
    base_DistGraph::increment_evilPhase();
  }

  /**
   * Send data out from inspection to other hosts.
   *
   * @param[in,out] numOutgoingEdges specifies which nodes on a host will have
   * outgoing edges
   * @param[in] hostHasOutgoing bitset tracking which hosts have outgoing
   * edges from this host
   */
  void sendInspectionData(std::vector>& numOutgoingEdges,
                          galois::DynamicBitSet& hostHasOutgoing) {
    auto& net = galois::runtime::getSystemNetworkInterface();
    galois::GAccumulator bytesSent;
    bytesSent.reset();

    for (unsigned h = 0; h < net.Num; h++) {
      if (h == net.ID) {
        continue;
      }
      // send outgoing edges data off to comm partner
      galois::runtime::SendBuffer b;
      // only send if non-zeros exist
      if (hostHasOutgoing.test(h)) {
        galois::runtime::gSerialize(b, 1); // token saying data exists
        galois::runtime::gSerialize(b, numOutgoingEdges[h]);
      } else {
        galois::runtime::gSerialize(b, 0); // token saying no data exists
      }
      numOutgoingEdges[h].clear();
      bytesSent.update(b.size());
      // send buffer and free memory
      net.sendTagged(h, galois::runtime::evilPhase, b);
      b.getVec().clear();
    }

    galois::runtime::reportStat_Tsum(
        GRNAME, std::string("EdgeInspectionBytesSent"), bytesSent.reduce());
    galois::gPrint("[", base_DistGraph::id, "] Inspection sends complete.\n");
  }

  /**
   * Receive data from inspection from other hosts. Processes the incoming
   * edge bitsets/offsets.
*
   * @param[in,out] numOutgoingEdges specifies which nodes on a host will have
   * outgoing edges
   */
  void recvInspectionData(std::vector>& numOutgoingEdges) {
    auto& net = galois::runtime::getSystemNetworkInterface();

    // one message is expected from every other host
    for (unsigned h = 0; h < net.Num - 1; h++) {
      // expect data from comm partner back
      decltype(net.recieveTagged(galois::runtime::evilPhase, nullptr)) p;
      do {
        p = net.recieveTagged(galois::runtime::evilPhase, nullptr);
      } while (!p);
      uint32_t sendingHost = p->first;

      // get outgoing edges; first get status var
      uint32_t outgoingExists = 2;
      galois::runtime::gDeserialize(p->second, outgoingExists);
      if (outgoingExists == 1) {
        // actual data sent
        galois::runtime::gDeserialize(p->second, numOutgoingEdges[sendingHost]);
      } else if (outgoingExists == 0) {
        // no data sent; just clear again
        numOutgoingEdges[sendingHost].clear();
      } else {
        GALOIS_DIE("unreachable: ", outgoingExists);
      }
    }
    galois::gPrint("[", base_DistGraph::id,
                   "] Inspection receives complete.\n");
  }

  /**
   * Take inspection metadata and begin mapping nodes/creating prefix sums,
   * return the prefix sum.
   */
  galois::gstl::Vector
  finalizePrefixSum(std::vector>& numOutgoingEdges,
                    galois::gstl::Vector& prefixSumOfEdges) {
    base_DistGraph::numEdges = 0;
    inspectOutgoingNodes(numOutgoingEdges, prefixSumOfEdges);
    finalizeInspection(prefixSumOfEdges);
    galois::gDebug("[", base_DistGraph::id,
                   "] To receive this many nodes: ", nodesToReceive);
    galois::gPrint("[", base_DistGraph::id,
                   "] Inspection allocation complete.\n");
    return prefixSumOfEdges;
  }

  /**
   * Outgoing inspection: loop over proxy nodes, determine if need to receive
   * edges.
   */
  void inspectOutgoingNodes(std::vector>& numOutgoingEdges,
                            galois::gstl::Vector& prefixSumOfEdges) {
    galois::GAccumulator toReceive;
    toReceive.reset();
    // proxies (mirrors) occupy the local ids after the owned nodes
    uint32_t proxyStart = base_DistGraph::numOwned;
    uint32_t proxyEnd = base_DistGraph::numNodes;
    assert(proxyEnd == prefixSumOfEdges.size());
    galois::GAccumulator edgesToReceive;
    edgesToReceive.reset();

    // loop over proxy nodes, see if edges need to be sent from another host
    // by looking at results of edge inspection
    galois::do_all(
        galois::iterate(proxyStart, proxyEnd),
        [&](uint32_t lid) {
          uint64_t gid = base_DistGraph::localToGlobalVector[lid];
          assert(gid < base_DistGraph::numGlobalNodes);
          unsigned hostReader = getHostReader(gid);
          assert(hostReader < base_DistGraph::numHosts);
          assert(hostReader != base_DistGraph::id); // self shouldn't be proxy
          uint64_t nodeOffset = base_DistGraph::gid2host[hostReader].first;
          if (numOutgoingEdges[hostReader].size()) {
            if (numOutgoingEdges[hostReader][gid - nodeOffset]) {
              // if this host is going to send me edges, note it for future use
              prefixSumOfEdges[lid] =
                  numOutgoingEdges[hostReader][gid - nodeOffset];
              edgesToReceive += numOutgoingEdges[hostReader][gid - nodeOffset];
              toReceive += 1;
            }
          }
        },
        galois::loopname("OutgoingNodeInspection"), galois::steal(),
        galois::no_stats());

    galois::gPrint("[", base_DistGraph::id, "] Need receive ",
                   edgesToReceive.reduce(), " edges; self is ", myKeptEdges,
                   "\n");
    // get memory back
    numOutgoingEdges.clear();
    nodesToReceive = toReceive.reduce();
  }

  /**
   * finalize metadata maps: turn the per-node kept-edge counts into a prefix
   * sum and derive this host's local edge count from its last entry
   */
  void finalizeInspection(galois::gstl::Vector& prefixSumOfEdges) {
    for (unsigned i = 1; i < base_DistGraph::numNodes; i++) {
      // finalize prefix sum
      prefixSumOfEdges[i] += prefixSumOfEdges[i - 1];
    }
    if (prefixSumOfEdges.size() != 0) {
      base_DistGraph::numEdges = prefixSumOfEdges.back();
    } else {
      base_DistGraph::numEdges = 0;
    }
  }

  //////////////////////////////////////////////////////////////////////////////

public:
  //! counts accesses to the edge GID -> LID map (for statistics reporting)
  galois::GAccumulator lgMapAccesses;

  /**
   * Construct a map from
local edge GIDs to LID
   */
  void constructLocalEdgeGIDMap() {
    lgMapAccesses.reset();
    galois::StatTimer mapConstructTimer("GID2LIDMapConstructTimer", GRNAME);
    mapConstructTimer.start();

    localEdgeGIDToLID.reserve(base_DistGraph::sizeEdges());

    // walk edges in storage order; "count" is the edge's local id
    uint64_t count = 0;
    for (unsigned src = 0; src < base_DistGraph::size(); src++) {
      for (auto edge = base_DistGraph::edge_begin(src);
           edge != base_DistGraph::edge_end(src); edge++) {
        assert((*edge) == count);
        unsigned dst = base_DistGraph::getEdgeDst(edge);
        uint64_t localGID = getEdgeGIDFromSD(src, dst);
        // insert into map
        localEdgeGIDToLID.insert(std::make_pair(localGID, count));
        count++;
      }
    }

    GALOIS_ASSERT(localEdgeGIDToLID.size() == base_DistGraph::sizeEdges());
    GALOIS_ASSERT(count == base_DistGraph::sizeEdges());
    mapConstructTimer.stop();
  }

  //! report map accesses recorded so far (snapshot before main computation)
  void reportAccessBefore() {
    galois::runtime::reportStat_Single(GRNAME, std::string("MapAccessesBefore"),
                                       lgMapAccesses.reduce());
  }

  //! report total map accesses recorded
  void reportAccess() {
    galois::runtime::reportStat_Single(GRNAME, std::string("MapAccesses"),
                                       lgMapAccesses.reduce());
  }

  /**
   * checks map constructed above to see which local id corresponds
   * to a node/edge (if it exists)
   *
   * assumes map is generated
   */
  std::pair getLIDFromMap(unsigned src, unsigned dst) {
    lgMapAccesses += 1;
    // try to find gid in map
    uint64_t localGID = getEdgeGIDFromSD(src, dst);
    auto findResult = localEdgeGIDToLID.find(localGID);
    // return if found, else return a false
    if (findResult != localEdgeGIDToLID.end()) {
      return std::make_pair(findResult->second, true);
    } else {
      // not found
      return std::make_pair((uint64_t)-1, false);
    }
  }

  //! look up an edge's local id from its global id by linearly scanning the
  //! source node's out-edges; dies if the edge does not exist locally
  uint64_t getEdgeLID(uint64_t gid) {
    uint64_t sourceNodeGID = edgeGIDToSource(gid);
    uint64_t sourceNodeLID = base_DistGraph::getLID(sourceNodeGID);
    uint64_t destNodeLID = base_DistGraph::getLID(edgeGIDToDest(gid));

    for (auto edge : base_DistGraph::edges(sourceNodeLID)) {
      uint64_t edgeDst = base_DistGraph::getEdgeDst(edge);
      if (edgeDst == destNodeLID) {
        return *edge;
      }
    }
    GALOIS_DIE("unreachable");
    return (uint64_t)-1;
  }

  //! find the source node of an edge given the edge's local id by scanning
  //! per-node edge ranges (linear scan; see TODO about binary search)
  uint32_t findSourceFromEdge(uint64_t lid) {
    // TODO binary search
    // uint32_t left = 0;
    // uint32_t right = base_DistGraph::numNodes;
    // uint32_t mid = (left + right) / 2;

    for (uint32_t mid = 0; mid < base_DistGraph::numNodes; mid++) {
      uint64_t edge_left = *(base_DistGraph::edge_begin(mid));
      uint64_t edge_right = *(base_DistGraph::edge_begin(mid + 1));

      if (edge_left <= lid && lid < edge_right) {
        return mid;
      }
    }
    GALOIS_DIE("unreachable");
    return (uint32_t)-1;
  }

  //! translate an edge's local id into its global id
  uint64_t getEdgeGID(uint64_t lid) {
    uint32_t src = base_DistGraph::getGID(findSourceFromEdge(lid));
    uint32_t dst = base_DistGraph::getGID(base_DistGraph::getEdgeDst(lid));
    return getEdgeGIDFromSD(src, dst);
  }

private:
  // pairing function: encodes (source, dest) into a single edge global id
  // https://www.quora.com/
  // Is-there-a-mathematical-function-that-converts-two-numbers-into-one-so-
  // that-the-two-numbers-can-always-be-extracted-again
  // GLOBAL IDS ONLY
  uint64_t getEdgeGIDFromSD(uint32_t source, uint32_t dest) {
    return source + (dest % base_DistGraph::numGlobalNodes) *
                        base_DistGraph::numGlobalNodes;
  }

  //! inverse of getEdgeGIDFromSD: recover the source global id
  uint64_t edgeGIDToSource(uint64_t gid) {
    return gid % base_DistGraph::numGlobalNodes;
  }

  //! inverse of getEdgeGIDFromSD: recover the destination global id
  uint64_t edgeGIDToDest(uint64_t gid) {
    // assuming this floors
    return gid / base_DistGraph::numGlobalNodes;
  }

  /**
   * Fill up mirror arrays.
   * TODO make parallel?
*/
  void fillMirrors() {
    base_DistGraph::mirrorNodes.reserve(base_DistGraph::numNodes -
                                        base_DistGraph::numOwned);
    // every non-owned local node is a mirror of its master host
    for (uint32_t i = base_DistGraph::numOwned; i < base_DistGraph::numNodes;
         i++) {
      uint32_t globalID = base_DistGraph::localToGlobalVector[i];
      base_DistGraph::mirrorNodes[graphPartitioner->retrieveMaster(globalID)]
          .push_back(globalID);
    }
  }

  //! record, per master host, the edge global ids of all edges whose source
  //! is a mirror on this host (builds the mirror-edge map)
  void fillMirrorsEdgesAndCreateMirrorMap() {
    for (uint32_t src = base_DistGraph::numOwned;
         src < base_DistGraph::numNodes; src++) {
      auto ee = base_DistGraph::edge_begin(src);
      auto ee_end = base_DistGraph::edge_end(src);
      uint32_t globalSource = base_DistGraph::getGID(src);
      unsigned sourceOwner = graphPartitioner->retrieveMaster(globalSource);
      for (; ee != ee_end; ++ee) {
        // create mirror array
        uint64_t edgeGID = getEdgeGIDFromSD(
            globalSource,
            base_DistGraph::getGID(base_DistGraph::getEdgeDst(ee)));
        mirrorEdges[sourceOwner].push_back(edgeGID);
      }
    }
  }

  ////////////////////////////////////////////////////////////////////////////////

  /**
   * Load this host's assigned edges into the in-memory graph: constructs
   * local edges while sending duplicates other hosts need, then receives
   * edges destined for local proxies.
   */
  template
  void loadEdges(GraphTy& graph,
                 galois::graphs::BufferedGraph& bufGraph,
                 std::vector& proxiesOnOtherHosts) {
    galois::StatTimer loadEdgeTimer("EdgeLoading", GRNAME);
    loadEdgeTimer.start();
    bufGraph.resetReadCounters();

    std::atomic receivedNodes;
    receivedNodes.store(0);

    // sends data
    sendEdges(graph, bufGraph, receivedNodes, proxiesOnOtherHosts);
    // uint64_t bufBytesRead = bufGraph.getBytesRead();
    // get data from graph back (don't need it after sending things out)
    bufGraph.resetAndFree();

    // receives data
    galois::on_each(
        [&](unsigned GALOIS_UNUSED(tid), unsigned GALOIS_UNUSED(nthreads)) {
          receiveEdges(graph, receivedNodes);
        });
    base_DistGraph::increment_evilPhase();

    loadEdgeTimer.stop();

    galois::gPrint("[", base_DistGraph::id, "] Edge loading time: ",
                   loadEdgeTimer.get_usec() / 1000000.0f, " seconds\n");
  }

  // no edge data version
  template
  void sendEdges(GraphTy& graph,
                 galois::graphs::BufferedGraph& bufGraph,
                 std::atomic& receivedNodes,
                 std::vector& proxiesOnOtherHosts) {
    using DstVecType = std::vector>;
    using SendBufferVecTy = std::vector;

    // per-thread scratch: destination batches and send buffers, one per host
    galois::substrate::PerThreadStorage gdst_vecs(
        base_DistGraph::numHosts);
    galois::substrate::PerThreadStorage sendBuffers(
        base_DistGraph::numHosts);

    auto& net = galois::runtime::getSystemNetworkInterface();
    const unsigned& id = this->base_DistGraph::id;
    const unsigned& numHosts = this->base_DistGraph::numHosts;

    galois::GAccumulator messagesSent;
    galois::GAccumulator bytesSent;
    galois::GReduceMax maxBytesSent;
    messagesSent.reset();
    bytesSent.reset();
    maxBytesSent.reset();

    // Go over assigned nodes and distribute edges.
    galois::do_all(
        galois::iterate(base_DistGraph::gid2host[base_DistGraph::id].first,
                        base_DistGraph::gid2host[base_DistGraph::id].second),
        [&](uint64_t src) {
          uint32_t lsrc = 0;
          uint64_t curEdge = 0;
          if (base_DistGraph::isLocal(src)) {
            lsrc = this->G2L(src);
            curEdge =
                *graph.edge_begin(lsrc, galois::MethodFlag::UNPROTECTED);
          }

          auto ee = bufGraph.edgeBegin(src);
          auto ee_end = bufGraph.edgeEnd(src);
          auto& gdst_vec = *gdst_vecs.getLocal();

          for (unsigned i = 0; i < numHosts; ++i) {
            gdst_vec[i].clear();
          }

          for (; ee != ee_end; ++ee) {
            uint32_t gdst = bufGraph.edgeDestination(*ee);
            // make sure this edge is going to be kept and not dropped
            if (graphPartitioner->keepEdge(src, gdst)) {
              assert(base_DistGraph::isLocal(src));
              // construct the edge locally...
              uint32_t ldst = this->G2L(gdst);
              graph.constructEdge(curEdge++, ldst);
              // ...and queue it for other hosts that hold both endpoints
              for (unsigned h = 0; h < net.Num; h++) {
                if (h != net.ID) {
                  if (proxiesOnOtherHosts[h].test(src)) {
                    // if kept, make sure destination exists on that host
                    if (proxiesOnOtherHosts[h].test(gdst)) {
                      // if it does, this edge must be duplicated on that host;
                      // increment count
                      gdst_vec[h].push_back(gdst);
                    }
                  }
                }
              }
            }
          }

          // make sure all edges accounted for if local
          if (base_DistGraph::isLocal(src)) {
            assert(curEdge == (*graph.edge_end(lsrc)));
          }

          // send
          for (uint32_t h = 0; h < numHosts; ++h) {
            if (h == id)
              continue;

            if (gdst_vec[h].size() > 0) {
              auto& b = (*sendBuffers.getLocal())[h];
              galois::runtime::gSerialize(b, src);
              galois::runtime::gSerialize(b, gdst_vec[h]);

              // send if over limit
              if (b.size() > edgePartitionSendBufSize) {
                messagesSent += 1;
                bytesSent.update(b.size());
                maxBytesSent.update(b.size());

                net.sendTagged(h, galois::runtime::evilPhase, b);
                b.getVec().clear();
                b.getVec().reserve(edgePartitionSendBufSize * 1.25);
              }
            }
          }

          // overlap receives
          auto buffer = net.recieveTagged(galois::runtime::evilPhase, nullptr);
          this->processReceivedEdgeBuffer(buffer, graph, receivedNodes);
        },
#if MORE_DIST_STATS
        galois::loopname("EdgeLoading"),
#endif
        galois::steal(), galois::no_stats());

    // flush buffers: send whatever remains for every (thread, host) pair
    for (unsigned threadNum = 0; threadNum < sendBuffers.size(); ++threadNum) {
      auto& sbr = *sendBuffers.getRemote(threadNum);
      for (unsigned h = 0; h < this->base_DistGraph::numHosts; ++h) {
        if (h == this->base_DistGraph::id)
          continue;
        auto& sendBuffer = sbr[h];
        if (sendBuffer.size() > 0) {
          messagesSent += 1;
          bytesSent.update(sendBuffer.size());
          maxBytesSent.update(sendBuffer.size());

          net.sendTagged(h, galois::runtime::evilPhase, sendBuffer);
          sendBuffer.getVec().clear();
        }
      }
    }

    net.flush();

    galois::runtime::reportStat_Tsum(
        GRNAME, std::string("EdgeLoadingMessagesSent"), messagesSent.reduce());
    galois::runtime::reportStat_Tsum(
        GRNAME, std::string("EdgeLoadingBytesSent"), bytesSent.reduce());
    galois::runtime::reportStat_Tmax(
        GRNAME, std::string("EdgeLoadingMaxBytesSent"), maxBytesSent.reduce());
  }

  //!
@copydoc DistGraphHybridCut::processReceivedEdgeBuffer template void processReceivedEdgeBuffer( std::optional>& buffer, GraphTy& graph, std::atomic& receivedNodes) { if (buffer) { auto& rb = buffer->second; while (rb.r_size() > 0) { uint64_t n; std::vector gdst_vec; galois::runtime::gDeserialize(rb, n); galois::runtime::gDeserialize(rb, gdst_vec); assert(base_DistGraph::isLocal(n)); uint32_t lsrc = this->G2L(n); uint64_t cur = *graph.edge_begin(lsrc, galois::MethodFlag::UNPROTECTED); uint64_t cur_end = *graph.edge_end(lsrc); assert((cur_end - cur) == gdst_vec.size()); deserializeEdges(graph, gdst_vec, cur, cur_end); ++receivedNodes; } } } /** * Receive the edge dest/data assigned to this host from other hosts * that were responsible for reading them. */ template void receiveEdges(GraphTy& graph, std::atomic& receivedNodes) { auto& net = galois::runtime::getSystemNetworkInterface(); // receive edges for all mirror nodes while (receivedNodes < nodesToReceive) { decltype(net.recieveTagged(galois::runtime::evilPhase, nullptr)) p; p = net.recieveTagged(galois::runtime::evilPhase, nullptr); processReceivedEdgeBuffer(p, graph, receivedNodes); } } template void deserializeEdges(GraphTy& graph, std::vector& gdst_vec, uint64_t& cur, uint64_t& cur_end) { uint64_t i = 0; while (cur < cur_end) { uint64_t gdst = gdst_vec[i++]; uint32_t ldst = this->G2L(gdst); graph.constructEdge(cur++, ldst); } } }; // make GRNAME visible to public template constexpr const char* const galois::graphs::MiningGraph::GRNAME; } // end namespace graphs } // end namespace galois #endif ================================================ FILE: libcusp/include/galois/graphs/NewGeneric.h ================================================ /* * This file belongs to the Galois project, a C++ library for exploiting * parallelism. The code is being released under the terms of the 3-Clause BSD * License (a copy is located in LICENSE.txt at the top-level directory). 
*
 * Copyright (C) 2020, The University of Texas at Austin. All rights reserved.
 * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS
 * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF
 * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF
 * DEALING OR USAGE OF TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH
 * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances
 * shall University be liable for incidental, special, indirect, direct or
 * consequential damages or loss of profits, interruption of business, or
 * related expenses which may arise from use of Software or Documentation,
 * including but not limited to those resulting from defects in Software and/or
 * Documentation, or loss or inaccuracy of data of any kind.
 */

/**
 * @file NewGeneric.h
 *
 * Contains the main graph class as well as the partitioning logic that CuSP
 * uses.
 */

#ifndef _GALOIS_DIST_NEWGENERIC_H
#define _GALOIS_DIST_NEWGENERIC_H

#include "galois/graphs/DistributedGraph.h"
#include "galois/DReducible.h"
// NOTE(review): the two includes below lost their targets during extraction —
// restore from upstream before building
#include
#include

#define CUSP_PT_TIMER 0

namespace galois {
namespace graphs {
/**
 * @tparam NodeTy type of node data for the graph
 * @tparam EdgeTy type of edge data for the graph
 *
 * @todo fully document and clean up code
 * @warning not meant for public use + not fully documented yet
 */
template
class NewDistGraphGeneric : public DistGraph {
  //! size used to buffer edge sends during partitioning
  constexpr static unsigned edgePartitionSendBufSize = 8388608;
  constexpr static const char* const GRNAME = "dGraph_Generic";
  std::unique_ptr graphPartitioner;

  //! How many rounds to sync state during edge assignment phase
  uint32_t _edgeStateRounds;
  //! per-host edge-load accumulators for the current round
  std::vector> hostLoads;
  //! edge loads accumulated over previous rounds
  std::vector old_hostLoads;

  //! translate a global id to a local id; fast path for edge cuts where the
  //! owned nodes occupy a contiguous global-id range
  uint32_t G2LEdgeCut(uint64_t gid, uint32_t globalOffset) const {
    assert(base_DistGraph::isLocal(gid));
    // optimized for edge cuts
    if (gid >= globalOffset && gid < globalOffset + base_DistGraph::numOwned)
      return gid - globalOffset;
    return base_DistGraph::globalToLocalMap.at(gid);
  }

  /**
   * Free memory of a vector by swapping an empty vector with it
   */
  template
  void freeVector(V& vectorToKill) {
    V dummyVector;
    vectorToKill.swap(dummyVector);
  }

  //! number of nodes this host expects to receive edges for
  uint32_t nodesToReceive;

public:
  //! typedef for base DistGraph class
  using base_DistGraph = DistGraph;

private:
  virtual unsigned getHostIDImpl(uint64_t gid) const {
    assert(gid < base_DistGraph::numGlobalNodes);
    return graphPartitioner->retrieveMaster(gid);
  }

  virtual bool isOwnedImpl(uint64_t gid) const {
    assert(gid < base_DistGraph::numGlobalNodes);
    return (graphPartitioner->retrieveMaster(gid) == base_DistGraph::id);
  }

  virtual bool isLocalImpl(uint64_t gid) const {
    assert(gid < base_DistGraph::numGlobalNodes);
    return (base_DistGraph::globalToLocalMap.find(gid) !=
            base_DistGraph::globalToLocalMap.end());
  }

  // TODO current uses graph partitioner
  // TODO make it so user doesn't have to specify; can be done by tracking
  // if an outgoing mirror is marked as having an incoming edge on any
  // host
  virtual bool isVertexCutImpl() const {
    return graphPartitioner->isVertexCut();
  }

  virtual std::pair cartesianGridImpl() const {
    return graphPartitioner->cartesianGrid();
  }

public:
  /**
   * Reset load balance on host reducibles.
   */
  void resetEdgeLoad() {
    if (_edgeStateRounds > 1) {
      if (!graphPartitioner->noCommunication()) {
        for (unsigned i = 0; i < base_DistGraph::numHosts; i++) {
          hostLoads[i].reset();
          old_hostLoads[i] = 0;
        }
      }
    }
  }

  /**
   * Sync load balance on hosts using reducibles.
*/
  void syncEdgeLoad() {
    if (_edgeStateRounds > 1) {
      if (!graphPartitioner->noCommunication()) {
        for (unsigned i = 0; i < base_DistGraph::numHosts; i++) {
          old_hostLoads[i] += hostLoads[i].reduce();
          hostLoads[i].reset();
        }
      }
    }
  }

  /**
   * Debug function: prints host loads.
   */
  void printEdgeLoad() {
    if (_edgeStateRounds > 1) {
      if (!graphPartitioner->noCommunication()) {
        if (base_DistGraph::id == 0) {
          for (unsigned i = 0; i < base_DistGraph::numHosts; i++) {
            galois::gDebug("[", base_DistGraph::id, "] ", i, " ",
                           old_hostLoads[i], "\n");
          }
        }
      }
    }
  }

  /**
   * Constructor: reads this host's slice of the graph, optionally runs the
   * master-assignment phase, inspects and exchanges edge metadata, then
   * allocates and constructs the local partitioned graph.
   */
  NewDistGraphGeneric(
      const std::string& filename, unsigned host, unsigned _numHosts,
      bool cuspAsync = true, uint32_t stateRounds = 100, bool transpose = false,
      galois::graphs::MASTERS_DISTRIBUTION md = BALANCED_EDGES_OF_MASTERS,
      uint32_t nodeWeight = 0, uint32_t edgeWeight = 0,
      std::string masterBlockFile = "", bool readFromFile = false,
      std::string localGraphFileName = "local_graph",
      uint32_t edgeStateRounds = 1)
      : base_DistGraph(host, _numHosts), _edgeStateRounds(edgeStateRounds) {
    galois::runtime::reportParam("dGraph", "GenericPartitioner", "0");
    galois::CondStatTimer Tgraph_construct(
        "GraphPartitioningTime", GRNAME);
    Tgraph_construct.start();

    // short-circuit: reload a previously saved local graph instead of
    // partitioning from scratch
    if (readFromFile) {
      galois::gPrint("[", base_DistGraph::id,
                     "] Reading local graph from file ", localGraphFileName,
                     "\n");
      base_DistGraph::read_local_graph_from_file(localGraphFileName);
      Tgraph_construct.stop();
      return;
    }

    galois::graphs::OfflineGraph g(filename);
    base_DistGraph::numGlobalNodes = g.size();
    base_DistGraph::numGlobalEdges = g.sizeEdges();
    std::vector dummy;
    // not actually getting masters, but getting assigned readers for nodes
    if (masterBlockFile == "") {
      base_DistGraph::computeMasters(md, g, dummy, nodeWeight, edgeWeight);
    } else {
      galois::gInfo("Getting reader assignment from file");
      base_DistGraph::readersFromFile(g, masterBlockFile);
    }

    graphPartitioner = std::make_unique(
        host, _numHosts, base_DistGraph::numGlobalNodes,
        base_DistGraph::numGlobalEdges);
    // TODO abstract this away somehow
    graphPartitioner->saveGIDToHost(base_DistGraph::gid2host);

    uint64_t nodeBegin = base_DistGraph::gid2host[base_DistGraph::id].first;
    typename galois::graphs::OfflineGraph::edge_iterator edgeBegin =
        g.edge_begin(nodeBegin);
    uint64_t nodeEnd = base_DistGraph::gid2host[base_DistGraph::id].second;
    typename galois::graphs::OfflineGraph::edge_iterator edgeEnd =
        g.edge_begin(nodeEnd);

    // signifies how many outgoing edges a particular host should expect from
    // this host
    std::vector> numOutgoingEdges;
    // signifies if a host should create a node because it has an incoming edge
    std::vector hasIncomingEdge;

    // only need to use for things that need communication
    if (!graphPartitioner->noCommunication()) {
      if (_edgeStateRounds > 1) {
        hostLoads.resize(base_DistGraph::numHosts);
        old_hostLoads.resize(base_DistGraph::numHosts);
        resetEdgeLoad();
      }
      numOutgoingEdges.resize(base_DistGraph::numHosts);
      hasIncomingEdge.resize(base_DistGraph::numHosts);
    }

    // phase 0
    galois::gPrint("[", base_DistGraph::id, "] Starting graph reading.\n");
    galois::graphs::BufferedGraph bufGraph;
    bufGraph.resetReadCounters();
    galois::StatTimer graphReadTimer("GraphReading", GRNAME);
    graphReadTimer.start();
    bufGraph.loadPartialGraph(filename, nodeBegin, nodeEnd, *edgeBegin,
                              *edgeEnd, base_DistGraph::numGlobalNodes,
                              base_DistGraph::numGlobalEdges);
    graphReadTimer.stop();
    galois::gPrint("[", base_DistGraph::id, "] Reading graph complete.\n");

    if (graphPartitioner->masterAssignPhase()) {
      // loop over all nodes, determine where neighbors are, assign masters
      galois::StatTimer phase0Timer("Phase0", GRNAME);
      galois::gPrint("[", base_DistGraph::id,
                     "] Starting master assignment.\n");
      phase0Timer.start();
      phase0(bufGraph, cuspAsync, stateRounds);
      phase0Timer.stop();
      galois::gPrint("[", base_DistGraph::id,
                     "] Master assignment complete.\n");
    }

    galois::StatTimer inspectionTimer("EdgeInspection", GRNAME);
    inspectionTimer.start();
    bufGraph.resetReadCounters();
    galois::gstl::Vector prefixSumOfEdges;

    // assign edges to other nodes
    if (!graphPartitioner->noCommunication()) {
      edgeInspection(bufGraph, numOutgoingEdges, hasIncomingEdge,
                     inspectionTimer);
      galois::DynamicBitSet& finalIncoming =
          hasIncomingEdge[base_DistGraph::id];

      galois::StatTimer mapTimer("NodeMapping", GRNAME);
      mapTimer.start();
      nodeMapping(numOutgoingEdges, finalIncoming, prefixSumOfEdges);
      mapTimer.stop();

      finalIncoming.resize(0);
    } else {
      base_DistGraph::numOwned = nodeEnd - nodeBegin;
      uint64_t edgeOffset = *bufGraph.edgeBegin(nodeBegin);
      // edge prefix sum, no comm required
      edgeCutInspection(bufGraph, inspectionTimer, edgeOffset,
                        prefixSumOfEdges);
    }
    // inspection timer is stopped in edgeInspection function

    // flip partitioners that have a master assignment phase to stage 2
    // (meaning all nodes and masters that will be on this host are present in
    // the partitioner's metadata)
    if (graphPartitioner->masterAssignPhase()) {
      graphPartitioner->enterStage2();
    }

    // get memory back from inspection metadata
    numOutgoingEdges.clear();
    hasIncomingEdge.clear();
    // doubly make sure the data is cleared
    freeVector(numOutgoingEdges); // should no longer use this variable
    freeVector(hasIncomingEdge);  // should no longer use this variable

    // Graph construction related calls
    base_DistGraph::beginMaster = 0;
    // Allocate and construct the graph
    base_DistGraph::graph.allocateFrom(base_DistGraph::numNodes,
                                       base_DistGraph::numEdges);
    base_DistGraph::graph.constructNodes();

    // edge end fixing
    auto& base_graph = base_DistGraph::graph;
    galois::do_all(
        galois::iterate((uint32_t)0, base_DistGraph::numNodes),
        [&](uint64_t n) { base_graph.fixEndEdge(n, prefixSumOfEdges[n]); },
#if MORE_DIST_STATS
        galois::loopname("FixEndEdgeLoop"),
#endif
        galois::no_stats());
    // get memory from prefix sum back
    prefixSumOfEdges.clear();
    freeVector(prefixSumOfEdges); // should no longer use this variable

    galois::CondStatTimer TfillMirrors("FillMirrors", GRNAME);
    TfillMirrors.start();
    fillMirrors();
    TfillMirrors.stop();

    if (_edgeStateRounds > 1) {
      // reset edge load since we need exact same answers again
      resetEdgeLoad();
    }

    // Edge loading
    if (!graphPartitioner->noCommunication()) {
      loadEdges(base_DistGraph::graph, bufGraph);
    } else {
      // Edge cut construction
      edgeCutLoad(base_DistGraph::graph, bufGraph);
      bufGraph.resetAndFree();
    }

    // Finalization

    // TODO this is a hack; fix it somehow
    // if vertex cut but not a cart cut is the condition
    if (graphPartitioner->isVertexCut() &&
        graphPartitioner->cartesianGrid().first == 0) {
      base_DistGraph::numNodesWithEdges = base_DistGraph::numNodes;
    }

    if (transpose) {
      base_DistGraph::transposed = true;
      base_DistGraph::numNodesWithEdges = base_DistGraph::numNodes;
      if (base_DistGraph::numNodes > 0) {
        // consider all nodes to have outgoing edges (TODO better way to do
        // this?) for now it's fine I guess
        base_DistGraph::graph.transpose(GRNAME);
      }
    }

    galois::CondStatTimer Tthread_ranges("ThreadRangesTime",
                                         GRNAME);
    Tthread_ranges.start();
    base_DistGraph::determineThreadRanges();
    Tthread_ranges.stop();

    base_DistGraph::determineThreadRangesMaster();
    base_DistGraph::determineThreadRangesWithEdges();
    base_DistGraph::initializeSpecificRanges();

    Tgraph_construct.stop();
    galois::gPrint("[", base_DistGraph::id,
                   "] Graph construction complete.\n");

    // report state rounds
    if (base_DistGraph::id == 0) {
      galois::runtime::reportStat_Single(GRNAME, "CuSPStateRounds",
                                         (uint32_t)stateRounds);
    }
  }

private:
  // NOTE(review): this definition continues beyond the end of this chunk;
  // only the visible prefix is reproduced below.
  galois::runtime::SpecificRange>
  getSpecificThreadRange(galois::graphs::BufferedGraph& bufGraph,
                         std::vector& assignedThreadRanges,
                         uint64_t startNode, uint64_t endNode) {
    galois::StatTimer threadRangeTime("Phase0ThreadRangeTime");
    threadRangeTime.start();
    uint64_t numLocalNodes = endNode - startNode;
    galois::PODResizeableArray edgePrefixSum;
    edgePrefixSum.resize(numLocalNodes);

    // get thread ranges with a prefix sum
    galois::do_all(
        galois::iterate(startNode, endNode),
        [&](unsigned n) {
          uint64_t offset = n -
startNode; edgePrefixSum[offset] = bufGraph.edgeEnd(n) - bufGraph.edgeBegin(n); }, galois::no_stats()); for (unsigned i = 1; i < numLocalNodes; i++) { edgePrefixSum[i] += edgePrefixSum[i - 1]; } assignedThreadRanges = galois::graphs::determineUnitRangesFromPrefixSum( galois::runtime::activeThreads, edgePrefixSum); for (unsigned i = 0; i < galois::runtime::activeThreads + 1; i++) { assignedThreadRanges[i] += startNode; } auto toReturn = galois::runtime::makeSpecificRange( boost::counting_iterator(startNode), boost::counting_iterator(startNode + numLocalNodes), assignedThreadRanges.data()); threadRangeTime.stop(); return toReturn; } /** * For each other host, determine which nodes that this host needs to get * info from * * @param bufGraph Buffered graph used to loop over edges * @param ghosts bitset; at end * of execution, marked bits signify neighbors on this host that that other * host has read (and therefore must sync with me) */ // steps 1 and 2 of neighbor location setup: memory allocation, bitset setting void phase0BitsetSetup(galois::graphs::BufferedGraph& bufGraph, galois::DynamicBitSet& ghosts) { galois::StatTimer bitsetSetupTimer("Phase0BitsetSetup", GRNAME); bitsetSetupTimer.start(); ghosts.resize(bufGraph.size()); ghosts.reset(); std::vector rangeVector; auto start = base_DistGraph::gid2host[base_DistGraph::id].first; auto end = base_DistGraph::gid2host[base_DistGraph::id].second; galois::runtime::SpecificRange> work = getSpecificThreadRange(bufGraph, rangeVector, start, end); // Step 2: loop over all local nodes, determine neighbor locations galois::do_all( galois::iterate(work), // galois::iterate(base_DistGraph::gid2host[base_DistGraph::id].first, // base_DistGraph::gid2host[base_DistGraph::id].second), [&](unsigned n) { // ptt.start(); // galois::gPrint("[", base_DistGraph::id, " ", // galois::substrate::getThreadPool().getTID(), "] ", n, "\n"); auto ii = bufGraph.edgeBegin(n); auto ee = bufGraph.edgeEnd(n); for (; ii < ee; ++ii) { uint32_t dst = 
bufGraph.edgeDestination(*ii); if ((dst < start) || (dst >= end)) { // not owned by this host // set on bitset ghosts.set(dst); } } // ptt.stop(); }, galois::loopname("Phase0BitsetSetup_DetermineNeighborLocations"), galois::steal(), galois::no_stats()); bitsetSetupTimer.stop(); } // sets up the gid to lid mapping for phase 0 /** * Set up the GID to LID mapping for phase 0: In the mapping vector, * read nodes occupy the first chunk, and nodes read by other hosts follow. * * @param ghosts * @param gid2offsets mapping vector: element at an offset corresponds to a * particular GID (and its master) * @param syncNodes one vector of nodes for each host: at the end of * execution, will contain mirrors on this host whose master is on that host * @returns Number of set bits */ uint64_t phase0MapSetup( galois::DynamicBitSet& ghosts, std::unordered_map& gid2offsets, galois::gstl::Vector>& syncNodes) { galois::StatTimer mapSetupTimer("Phase0MapSetup", GRNAME); mapSetupTimer.start(); uint32_t numLocal = base_DistGraph::gid2host[base_DistGraph::id].second - base_DistGraph::gid2host[base_DistGraph::id].first; uint32_t lid = numLocal; uint64_t numToReserve = ghosts.count(); gid2offsets.reserve(numToReserve); // TODO: parallelize using prefix sum? for (unsigned h = 0; h < base_DistGraph::numHosts; ++h) { if (h == base_DistGraph::id) continue; auto start = base_DistGraph::gid2host[h].first; auto end = base_DistGraph::gid2host[h].second; for (uint64_t gid = start; gid < end; ++gid) { if (ghosts.test(gid)) { gid2offsets[gid] = lid; syncNodes[h].push_back(gid - start); lid++; } } galois::gDebug("[", base_DistGraph::id, " -> ", h, "] bitset size ", (end - start) / 64, " vs. vector size ", syncNodes[h].size() / 2); } lid -= numLocal; assert(lid == numToReserve); galois::gDebug("[", base_DistGraph::id, "] total bitset size ", (ghosts.size() - numLocal) / 64, " vs. 
total vector size ", numToReserve / 2); // TODO: should not be used after this - refactor to make this clean ghosts.resize(0); mapSetupTimer.stop(); return lid; } // steps 4 and 5 of neighbor location setup /** * Let other hosts know which nodes they need to send to me by giving them * the bitset marked with nodes I am interested in on the other host. * * @param syncNodes one vector of nodes for each host: at the begin of * execution, will contain mirrors on this host whose master is on that host; * at the end of execution, will contain masters on this host whose mirror * is on that host */ void phase0SendRecv( galois::gstl::Vector>& syncNodes) { auto& net = galois::runtime::getSystemNetworkInterface(); galois::StatTimer p0BitsetCommTimer("Phase0SendRecvBitsets", GRNAME); p0BitsetCommTimer.start(); uint64_t bytesSent = 0; // Step 4: send bitset to other hosts for (unsigned h = 0; h < base_DistGraph::numHosts; h++) { galois::runtime::SendBuffer bitsetBuffer; if (h != base_DistGraph::id) { galois::runtime::gSerialize(bitsetBuffer, syncNodes[h]); bytesSent += bitsetBuffer.size(); net.sendTagged(h, galois::runtime::evilPhase, bitsetBuffer); } } // Step 5: recv bitset to other hosts; this indicates which local nodes each // other host needs to be informed of updates of for (unsigned h = 0; h < net.Num - 1; h++) { decltype(net.recieveTagged(galois::runtime::evilPhase, nullptr)) p; do { p = net.recieveTagged(galois::runtime::evilPhase, nullptr); } while (!p); uint32_t sendingHost = p->first; // deserialize into neighbor bitsets galois::runtime::gDeserialize(p->second, syncNodes[sendingHost]); } p0BitsetCommTimer.stop(); galois::runtime::reportStat_Tsum( GRNAME, std::string("Phase0SendRecvBitsetsBytesSent"), bytesSent); // comm phase complete base_DistGraph::increment_evilPhase(); } /** * Given a set of loads in a vector and the accumulation to those loads, * synchronize them across hosts and do the accumulation into the vector * of loads. 
 *
 * @param loads Vector of loads to accumulate to
 * @param accums Vector of accumulations to loads that occurred since last
 * sync
 */
// NOTE(review): template arguments were stripped from this extract (e.g.
// "std::vector>", bare "template"); declarations kept verbatim — restore
// the type parameters from upstream before compiling. TODO confirm.
void syncLoad(std::vector& loads, std::vector>& accums) {
  assert(loads.size() == accums.size());
  // use DG accumulator to force barrier on all hosts to sync this data
  galois::DGAccumulator syncer;
  // sync accum for each host one by one
  for (unsigned i = 0; i < loads.size(); i++) {
    syncer.reset();
    syncer += (accums[i].load());
    // zero the delta once it has been handed to the reducer
    accums[i].store(0);
    uint64_t accumulation = syncer.reduce();
    loads[i] += accumulation;
  }
}

/**
 * Given a copyable atomic vector, get data from it, save to a
 * PODResizeableArray, and reset value in the atomic array.
 *
 * @param atomic Atomic vector to extract and reset
 * @param nonAtomic PODarray to extract data into
 */
template
void extractAtomicToPODArray(std::vector>& atomic, galois::PODResizeableArray& nonAtomic) {
  nonAtomic.resize(atomic.size());
  // copy each atomic value out, then zero the source entry
  galois::do_all(
      galois::iterate((size_t)0, atomic.size()),
      [&](size_t i) {
        nonAtomic[i] = atomic[i].load();
        atomic[i].store(0);
      },
      galois::no_stats());
}

/**
 * Send newly accumulated node and edge loads to all other hosts and reset
 * the accumulated values. No DG accumulator used.
 *
 * @param nodeAccum new node accumulation for each host in system
 * @param edgeAccum new edge accumulation for each host in system
 */
void asyncSendLoad(galois::PODResizeableArray& nodeAccum, galois::PODResizeableArray& edgeAccum) {
  auto& net = galois::runtime::getSystemNetworkInterface();
  unsigned bytesSent = 0;
  galois::StatTimer sendTimer("Phase0AsyncSendLoadTime", GRNAME);
  sendTimer.start();
  for (unsigned h = 0; h < base_DistGraph::numHosts; h++) {
    if (h != base_DistGraph::id) {
      // serialize node and edge accumulations with tag 4 (to avoid
      // conflict with other tags being used) and send
      galois::runtime::SendBuffer b;
      galois::runtime::gSerialize(b, 4);
      galois::runtime::gSerialize(b, nodeAccum);
      galois::runtime::gSerialize(b, edgeAccum);
      bytesSent += b.size();
      // note the +1 on evil phase; load messages send using a different
      // phase to avoid conflicts
      net.sendTagged(h, base_DistGraph::evilPhasePlus1(), b);
    }
  }
  sendTimer.stop();
  galois::runtime::reportStat_Tsum(GRNAME, "Phase0AsyncSendLoadBytesSent",
                                   bytesSent);
}

/**
 * Receive (if it exists) new node/edge loads from other hosts and add it to
 * our own loads.
 *
 * @param nodeLoads current node load information for each host in system
 * @param edgeLoads current edge load information for each host in system
 * @param loadsClear bitset marking hosts that sent their "all clear" (tag 3)
 */
void asyncRecvLoad(std::vector& nodeLoads, std::vector& edgeLoads, galois::DynamicBitSet& loadsClear) {
  auto& net = galois::runtime::getSystemNetworkInterface();
  decltype(net.recieveTagged(base_DistGraph::evilPhasePlus1(), nullptr)) p;
  galois::StatTimer recvTimer("Phase0AsyncRecvLoadTime", GRNAME);
  recvTimer.start();
  do {
    // note the +1
    p = net.recieveTagged(base_DistGraph::evilPhasePlus1(), nullptr);
    if (p) {
      unsigned messageType = (unsigned)-1;
      // deserialize message type
      galois::runtime::gDeserialize(p->second, messageType);
      if (messageType == 4) {
        // tag 4 = load-update message: per-host node and edge deltas
        galois::PODResizeableArray recvNodeAccum;
        galois::PODResizeableArray recvEdgeAccum;
        // loads to add
        galois::runtime::gDeserialize(p->second, recvNodeAccum);
        galois::runtime::gDeserialize(p->second, recvEdgeAccum);
        assert(recvNodeAccum.size() == recvEdgeAccum.size());
        assert(recvNodeAccum.size() == nodeLoads.size());
        assert(recvEdgeAccum.size() == edgeLoads.size());
        galois::do_all(
            galois::iterate((size_t)0, recvNodeAccum.size()),
            [&](size_t i) {
              nodeLoads[i] += recvNodeAccum[i];
              edgeLoads[i] += recvEdgeAccum[i];
            },
            galois::no_stats());
      } else if (messageType == 3) {
        // all clear message from host
        uint32_t sendingHost = p->first;
        assert(!loadsClear.test(sendingHost));
        loadsClear.set(sendingHost);
      } else {
        GALOIS_DIE("unexpected message type in async load synchronization: ",
                   messageType);
      }
    }
  } while (p); // drain until no message is pending (non-blocking overall)
  recvTimer.stop();
}

/**
 * Send out accumulated loads from a round of node assignments to all other
 * hosts and also receive loads from other hosts if they exist
 * (non-blocking).
 *
 * @param nodeLoads current known node loads on this host
 * @param nodeAccum newly accumulated node loads from a prior round of node
 * assignments
 * @param edgeLoads current known edge loads on this host
 * @param edgeAccum newly accumulated edge loads from a prior round of node
 * assignments
 * @param loadsClear Bitset tracking if we have received all loads from
 * a particular host
 */
void asyncSyncLoad(std::vector& nodeLoads, std::vector>& nodeAccum, std::vector& edgeLoads, std::vector>& edgeAccum, galois::DynamicBitSet& loadsClear) {
  assert(nodeLoads.size() == base_DistGraph::numHosts);
  assert(nodeAccum.size() == base_DistGraph::numHosts);
  assert(edgeLoads.size() == base_DistGraph::numHosts);
  assert(edgeAccum.size() == base_DistGraph::numHosts);
  galois::StatTimer syncTimer("Phase0AsyncSyncLoadTime", GRNAME);
  syncTimer.start();
  // extract out data to send
  galois::PODResizeableArray nonAtomicNodeAccum;
  galois::PODResizeableArray nonAtomicEdgeAccum;
  extractAtomicToPODArray(nodeAccum, nonAtomicNodeAccum);
  extractAtomicToPODArray(edgeAccum, nonAtomicEdgeAccum);
  assert(nonAtomicNodeAccum.size() == base_DistGraph::numHosts);
  assert(nonAtomicEdgeAccum.size() == base_DistGraph::numHosts);
  // apply loads to self
  galois::do_all(
      galois::iterate((uint32_t)0, base_DistGraph::numHosts),
      [&](size_t i) {
        nodeLoads[i] += nonAtomicNodeAccum[i];
        edgeLoads[i] += nonAtomicEdgeAccum[i];
      },
      galois::no_stats());
#ifndef NDEBUG
  // extractAtomicToPODArray must have zeroed every accumulator
  for (unsigned i = 0; i < nodeAccum.size(); i++) {
    assert(nodeAccum[i].load() == 0);
    assert(edgeAccum[i].load() == 0);
  }
#endif
  // send both nodes and edges accumulation at once
  asyncSendLoad(nonAtomicNodeAccum, nonAtomicEdgeAccum);
  asyncRecvLoad(nodeLoads, edgeLoads, loadsClear);
  syncTimer.stop();
}

/**
 * Debug function: simply prints loads and accumulations
 *
 * @param loads Vector of loads to accumulate to
 * @param accums Vector of accumulations to loads that occurred since last
 * sync
 */
void printLoad(std::vector& loads, std::vector>& accums) {
  assert(loads.size() == accums.size());
  for (unsigned i = 0; i < loads.size(); i++) {
    galois::gDebug("[", base_DistGraph::id, "] ", i, " total ", loads[i],
                   " accum ", accums[i].load());
  }
}

/**
 * Given a vector of data and a bitset specifying which elements in the data
 * vector need to be extracted, extract the appropriate elements into
 * a vector.
 *
 * @param offsets Bitset specifying which elements in the data vector need
 * to be extracted.
 * @param dataVector Data vector to extract data from according to the bitset
 * @return Vector of extracted elements
 */
template
std::vector getDataFromOffsets(std::vector& offsetVector, const std::vector& dataVector) {
  std::vector toReturn;
  toReturn.resize(offsetVector.size());
  // parallel gather: toReturn[i] = dataVector[offsetVector[i]]
  galois::do_all(
      galois::iterate((size_t)0, offsetVector.size()),
      [&](unsigned i) { toReturn[i] = dataVector[offsetVector[i]]; },
      galois::no_stats());
  return toReturn;
}

/**
 * Given a host, a bitset that marks offsets, and a vector,
 * send the data located at the offsets from the vector to the
 * specified host. If bitset is unmarked, send a no-op.
 *
 * @param targetHost Host to send data to
 * @param toSync Bitset that specifies which offsets in the data vector
 * to send
 * @param dataVector Data to be sent to the target host
 * @param timerName suffix appended to the send-timer/stat name
 */
// NOTE(review): template arguments were stripped from this extract (e.g.
// "std::vector& dataVector", "galois::gstl::Vector>"); declarations kept
// verbatim — restore the type parameters from upstream before compiling.
//
// Wire protocol used below (first serialized word of every message):
//   0 = no-op, 1 = bitset + data, 2 = offset list + data, 3 = all-clear.
void sendOffsets(unsigned targetHost, galois::DynamicBitSet& toSync, std::vector& dataVector, std::string timerName = std::string()) {
  auto& net = galois::runtime::getSystemNetworkInterface();
  std::string statString = std::string("Phase0SendOffsets_") + timerName;
  uint64_t bytesSent = 0;
  galois::StatTimer sendOffsetsTimer(statString.c_str(), GRNAME);
  sendOffsetsTimer.start();
  // this means there are updates to send
  if (toSync.count()) {
    std::vector offsetVector = toSync.getOffsets();
    // get masters to send into a vector
    std::vector mastersToSend = getDataFromOffsets(offsetVector, dataVector);
    assert(mastersToSend.size());
    size_t num_selected = toSync.count();
    size_t num_total = toSync.size();
    // figure out how to send (most efficient method; either bitset
    // and data or offsets + data)
    size_t bitset_alloc_size = ((num_total + 63) / 64) * sizeof(uint64_t) + (2 * sizeof(size_t));
    size_t bitsetDataSize = (num_selected * sizeof(uint32_t)) + bitset_alloc_size + sizeof(num_selected);
    size_t offsetsDataSize = (num_selected * sizeof(uint32_t)) + (num_selected * sizeof(unsigned int)) + sizeof(uint32_t) + sizeof(num_selected);
    galois::runtime::SendBuffer b;
    // tag with send method and do send
    if (bitsetDataSize < offsetsDataSize) {
      // send bitset, tag 1
      galois::runtime::gSerialize(b, 1u);
      galois::runtime::gSerialize(b, toSync);
      galois::runtime::gSerialize(b, mastersToSend);
    } else {
      // send offsets, tag 2
      galois::runtime::gSerialize(b, 2u);
      galois::runtime::gSerialize(b, offsetVector);
      galois::runtime::gSerialize(b, mastersToSend);
    }
    bytesSent += b.size();
    net.sendTagged(targetHost, galois::runtime::evilPhase, b);
  } else {
    // send empty no-op message, tag 0
    galois::runtime::SendBuffer b;
    galois::runtime::gSerialize(b, 0u);
    bytesSent += b.size();
    net.sendTagged(targetHost, galois::runtime::evilPhase, b);
  }
  sendOffsetsTimer.stop();
  galois::runtime::reportStat_Tsum(GRNAME, statString + "BytesSent",
                                   bytesSent);
}

/**
 * Send new master assignment updates to other hosts based on syncNodes
 * for each host prepared in advance.
 *
 * @param begin to end: which nodes on this host have been updated
 * @param numLocalNodes: number of owned nodes
 * @param localNodeToMaster Vector map: an offset corresponds to a particular
 * GID; indicates masters of GIDs
 * @param syncNodes one vector of nodes for each host: contains mirrors on
 * this host whose master is on that host
 */
void syncAssignmentSends( uint32_t begin, uint32_t end, uint32_t numLocalNodes, std::vector& localNodeToMaster, galois::gstl::Vector>& syncNodes) {
  galois::StatTimer p0assignSendTime("Phase0AssignmentSendTime", GRNAME);
  p0assignSendTime.start();
  galois::DynamicBitSet toSync;
  toSync.resize(numLocalNodes);
  // send loop
  for (unsigned h = 0; h < base_DistGraph::numHosts; h++) {
    if (h != base_DistGraph::id) {
      toSync.reset();
      // send if in [start,end) and present in syncNodes[h]
      galois::do_all(
          galois::iterate(syncNodes[h]),
          [&](uint32_t lid) {
            if ((lid >= begin) && (lid < end)) {
              toSync.set(lid);
            }
          },
          galois::no_stats());
      // do actual send based on sync bitset
      sendOffsets(h, toSync, localNodeToMaster, "NewAssignments");
    }
  }
  p0assignSendTime.stop();
}

/**
 * Send message to all hosts saying we're done with assignments. Can
 * specify a phase to distinguish between all clears for assignments
 * and loads
 *
 * @param phase 0 = assignment all-clear (evilPhase), 1 = load all-clear
 * (evilPhase + 1); any other value is fatal
 */
void sendAllClears(unsigned phase = 0) {
  unsigned bytesSent = 0;
  auto& net = galois::runtime::getSystemNetworkInterface();
  galois::StatTimer allClearTimer("Phase0SendAllClearTime", GRNAME);
  allClearTimer.start();
  // send loop
  for (unsigned h = 0; h < base_DistGraph::numHosts; h++) {
    if (h != base_DistGraph::id) {
      galois::runtime::SendBuffer b;
      // tag 3 = all-clear
      galois::runtime::gSerialize(b, 3u);
      bytesSent += b.size();
      // assumes phase is 0 or 1
      if (phase == 1) {
        net.sendTagged(h, base_DistGraph::evilPhasePlus1(), b);
      } else if (phase == 0) {
        net.sendTagged(h, galois::runtime::evilPhase, b);
      } else {
        GALOIS_DIE("unexpected phase: ", phase);
      }
    }
  }
  allClearTimer.stop();
  galois::runtime::reportStat_Tsum(GRNAME, "Phase0SendAllClearBytesSent",
                                   bytesSent);
}

// Applies received (offset, master) pairs from sendingHost into
// localNodeToMaster, translating the sender-relative offsets to this host's
// vector positions through gid2offsets.
void saveReceivedMappings(std::vector& localNodeToMaster, std::unordered_map& gid2offsets, unsigned sendingHost, std::vector& receivedOffsets, std::vector& receivedMasters) {
  uint64_t hostOffset = base_DistGraph::gid2host[sendingHost].first;
  galois::gDebug("[", base_DistGraph::id, "] host ", sendingHost, " offset ",
                 hostOffset);
  // if execution gets here, messageType was 1 or 2
  assert(receivedMasters.size() == receivedOffsets.size());
  galois::do_all(
      galois::iterate((size_t)0, receivedMasters.size()),
      [&](size_t i) {
        uint64_t curGID = hostOffset + receivedOffsets[i];
        uint32_t indexIntoMap = gid2offsets[curGID];
        galois::gDebug("[", base_DistGraph::id, "] gid ", curGID, " offset ",
                       indexIntoMap);
        localNodeToMaster[indexIntoMap] = receivedMasters[i];
      },
      galois::no_stats());
}

/**
 * Receive offsets and masters into the provided vectors and return sending
 * host and the message type.
 *
 * @param receivedOffsets vector to receive offsets into
 * @param receivedMasters vector to receive masters mappings into
 * @returns sending host and message type of received data
 */
std::pair recvOffsetsAndMasters(std::vector& receivedOffsets, std::vector& receivedMasters) {
  auto& net = galois::runtime::getSystemNetworkInterface();
  decltype(net.recieveTagged(galois::runtime::evilPhase, nullptr)) p;
  // blocking receive: spin until a message arrives
  do {
    p = net.recieveTagged(galois::runtime::evilPhase, nullptr);
  } while (!p);
  uint32_t sendingHost = p->first;
  unsigned messageType = (unsigned)-1;
  // deserialize message type
  galois::runtime::gDeserialize(p->second, messageType);
  if (messageType == 1) {
    // bitset; deserialize, then get offsets
    galois::DynamicBitSet receivedSet;
    galois::runtime::gDeserialize(p->second, receivedSet);
    receivedOffsets = receivedSet.getOffsets();
    galois::runtime::gDeserialize(p->second, receivedMasters);
  } else if (messageType == 2) {
    // offsets
    galois::runtime::gDeserialize(p->second, receivedOffsets);
    galois::runtime::gDeserialize(p->second, receivedMasters);
  } else if (messageType != 0) {
    // 0 is a legal no-op; anything else is a protocol violation
    GALOIS_DIE("invalid message type for sync of master assignments: ",
               messageType);
  }
  galois::gDebug("[", base_DistGraph::id, "] host ", sendingHost,
                 " send message type ", messageType);
  return std::make_pair(sendingHost, messageType);
}

/**
 * Receive offsets and masters into the provided vectors and return sending
 * host and the message type, async (i.e. does not have to receive anything
 * to exit function.
 *
 * @param localNodeToMaster updated in place with received master mappings
 * @param gid2offsets GID -> vector-offset translation map
 * @param hostFinished bitset marking hosts that sent their tag-3 "done"
 */
void recvOffsetsAndMastersAsync( std::vector& localNodeToMaster, std::unordered_map& gid2offsets, galois::DynamicBitSet& hostFinished) {
  auto& net = galois::runtime::getSystemNetworkInterface();
  decltype(net.recieveTagged(galois::runtime::evilPhase, nullptr)) p;
  // repeat loop until no message
  do {
    p = net.recieveTagged(galois::runtime::evilPhase, nullptr);
    if (p) {
      uint32_t sendingHost = p->first;
      unsigned messageType = (unsigned)-1;
      std::vector receivedOffsets;
      std::vector receivedMasters;
      // deserialize message type
      galois::runtime::gDeserialize(p->second, messageType);
      if (messageType == 1) {
        // bitset; deserialize, then get offsets
        galois::DynamicBitSet receivedSet;
        galois::runtime::gDeserialize(p->second, receivedSet);
        receivedOffsets = receivedSet.getOffsets();
        galois::runtime::gDeserialize(p->second, receivedMasters);
        saveReceivedMappings(localNodeToMaster, gid2offsets, sendingHost,
                             receivedOffsets, receivedMasters);
      } else if (messageType == 2) {
        // offsets
        galois::runtime::gDeserialize(p->second, receivedOffsets);
        galois::runtime::gDeserialize(p->second, receivedMasters);
        saveReceivedMappings(localNodeToMaster, gid2offsets, sendingHost,
                             receivedOffsets, receivedMasters);
      } else if (messageType == 3) {
        // host indicating that it is done with all assignments from its
        // end; mark as such in bitset
        assert(!hostFinished.test(sendingHost));
        hostFinished.set(sendingHost);
      } else if (messageType != 0) {
        GALOIS_DIE("invalid message type for sync of master assignments: ",
                   messageType);
      }
      galois::gDebug("[", base_DistGraph::id, "] host ", sendingHost,
                     " send message type ", messageType);
    }
  } while (p);
}

/**
 * Receive new master assignment updates from other hosts and update local
 * mappings.
 *
 * @param localNodeToMaster Vector map: an offset corresponds to a particular
 * GID; indicates masters of GIDs
 * @param gid2offsets Map of GIDs to the offset into the vector map that
 * corresponds to it
 */
void syncAssignmentReceives(std::vector& localNodeToMaster, std::unordered_map& gid2offsets) {
  galois::StatTimer p0assignReceiveTime("Phase0AssignmentReceiveTime",
                                        GRNAME);
  p0assignReceiveTime.start();
  // receive loop: exactly one message from every other host
  for (unsigned h = 0; h < base_DistGraph::numHosts - 1; h++) {
    unsigned sendingHost;
    unsigned messageType;
    std::vector receivedOffsets;
    std::vector receivedMasters;
    std::tie(sendingHost, messageType) = recvOffsetsAndMasters(receivedOffsets, receivedMasters);
    if (messageType == 1 || messageType == 2) {
      saveReceivedMappings(localNodeToMaster, gid2offsets, sendingHost,
                           receivedOffsets, receivedMasters);
    }
  }
  p0assignReceiveTime.stop();
}

// Async variant: drains whatever assignment messages are currently pending
// without blocking; also records per-host tag-3 "finished" notifications.
void syncAssignmentReceivesAsync( std::vector& localNodeToMaster, std::unordered_map& gid2offsets, galois::DynamicBitSet& hostFinished) {
  galois::StatTimer p0assignReceiveTime("Phase0AssignmentReceiveTimeAsync",
                                        GRNAME);
  p0assignReceiveTime.start();
  recvOffsetsAndMastersAsync(localNodeToMaster, gid2offsets, hostFinished);
  p0assignReceiveTime.stop();
}

/**
 * Send/receive new master assignment updates to other hosts.
 *
 * @param begin to end: which nodes on this host have been updated
 * @param numLocalNodes: number of owned nodes
 * @param localNodeToMaster Vector map: an offset corresponds to a particular
 * GID; indicates masters of GIDs
 * @param syncNodes one vector of nodes for each host: contains mirrors on
 * this host whose master is on that host
 * @param gid2offsets Map of GIDs to the offset into the vector map that
 * corresponds to it
 */
void syncAssignment( uint32_t begin, uint32_t end, uint32_t numLocalNodes, std::vector& localNodeToMaster, galois::gstl::Vector>& syncNodes, std::unordered_map& gid2offsets) {
  galois::StatTimer syncAssignmentTimer("Phase0SyncAssignmentTime", GRNAME);
  syncAssignmentTimer.start();
  syncAssignmentSends(begin, end, numLocalNodes, localNodeToMaster,
                      syncNodes);
  syncAssignmentReceives(localNodeToMaster, gid2offsets);
  syncAssignmentTimer.stop();
}

// Async variant of syncAssignment: sends are identical, receives do not block.
void syncAssignmentAsync( uint32_t begin, uint32_t end, uint32_t numLocalNodes, std::vector& localNodeToMaster, galois::gstl::Vector>& syncNodes, std::unordered_map& gid2offsets, galois::DynamicBitSet& hostFinished) {
  galois::StatTimer syncAssignmentTimer("Phase0SyncAssignmentAsyncTime",
                                        GRNAME);
  syncAssignmentTimer.start();
  syncAssignmentSends(begin, end, numLocalNodes, localNodeToMaster,
                      syncNodes);
  syncAssignmentReceivesAsync(localNodeToMaster, gid2offsets, hostFinished);
  syncAssignmentTimer.stop();
}

/**
 * Send masters mappings that were read on this host to their appropriate
 * owners
 *
 * @param localNodeToMaster local id to master mapping map
 * @param syncNodes bitsets specifying which hosts have which neighbors
 * that this host has read
 */
void sendMastersToOwners( std::vector& localNodeToMaster, galois::gstl::Vector>& syncNodes) {
  uint32_t begin = base_DistGraph::gid2host[base_DistGraph::id].first;
  uint32_t end = base_DistGraph::gid2host[base_DistGraph::id].second;
  // for each host, determine which master assignments still need to be sent
  // (if a host is a master of a node, but that node is not present as a
  // neighbor on the host, then this host needs to send the master assignment)
  galois::DynamicBitSet toSend;
  toSend.resize(end - begin);
  for (unsigned h = 0; h < base_DistGraph::numHosts; ++h) {
    if (h != base_DistGraph::id) {
      toSend.reset();
      // send if present in localNodeToMaster but not present in syncNodes
      galois::do_all(
          galois::iterate((uint32_t)0, end - begin),
          [&](uint32_t lid) {
            if (localNodeToMaster[lid] == h) {
              toSend.set(lid);
            }
          },
          galois::no_stats());
      // clear nodes the target host already knows about (its mirrors here)
      galois::do_all(
          galois::iterate(syncNodes[h]),
          [&](uint32_t lid) { toSend.reset(lid); },
          galois::no_stats());
      sendOffsets(h, toSend, localNodeToMaster, "MastersToOwners");
    }
  }
}

/**
 * Receive master mapping messages from hosts and add it to the graph
 * partitioner's map.
 */
void recvMastersToOwners() {
  // expect one message from every other host
  for (unsigned h = 0; h < base_DistGraph::numHosts - 1; h++) {
    unsigned sendingHost;
    unsigned messageType;
    std::vector receivedOffsets;
    std::vector receivedMasters;
    std::tie(sendingHost, messageType) = recvOffsetsAndMasters(receivedOffsets, receivedMasters);
    if (messageType == 1 || messageType == 2) {
      assert(receivedMasters.size() == receivedOffsets.size());
      uint64_t hostOffset = base_DistGraph::gid2host[sendingHost].first;
      // must be single threaded as map updating isn't thread-safe
      for (unsigned i = 0; i < receivedMasters.size(); i++) {
        uint64_t gidToMap = hostOffset + receivedOffsets[i];
#ifndef NDEBUG
        bool newMapped =
#endif
            graphPartitioner->addMasterMapping(gidToMap, receivedMasters[i]);
        assert(newMapped);
      }
    }
  }
}

/**
 * Phase responsible for initial master assignment.
 *
 * @param bufGraph Locally read graph on this host
 * @param async Specifies whether or not do synchronization of node
 * assignments BSP style or asynchronous style. Note regardless of which
 * is chosen there is a barrier at the end of master assignment.
*/ void phase0(galois::graphs::BufferedGraph& bufGraph, bool async, const uint32_t stateRounds) { galois::DynamicBitSet ghosts; galois::gstl::Vector> syncNodes; // masterNodes syncNodes.resize(base_DistGraph::numHosts); // determine on which hosts that this host's read nodes havs neighbors on phase0BitsetSetup(bufGraph, ghosts); // gid to vector offset setup std::unordered_map gid2offsets; uint64_t neighborCount = phase0MapSetup(ghosts, gid2offsets, syncNodes); galois::gDebug("[", base_DistGraph::id, "] num neighbors found is ", neighborCount); // send off neighbor metadata phase0SendRecv(syncNodes); galois::StatTimer p0allocTimer("Phase0AllocationTime", GRNAME); p0allocTimer.start(); // setup other partitioning metadata: nodes on each host, edges on each // host (as determined by edge cut) std::vector nodeLoads; std::vector edgeLoads; std::vector> nodeAccum; std::vector> edgeAccum; nodeLoads.assign(base_DistGraph::numHosts, 0); edgeLoads.assign(base_DistGraph::numHosts, 0); nodeAccum.assign(base_DistGraph::numHosts, 0); edgeAccum.assign(base_DistGraph::numHosts, 0); uint32_t numLocalNodes = base_DistGraph::gid2host[base_DistGraph::id].second - base_DistGraph::gid2host[base_DistGraph::id].first; std::vector localNodeToMaster; localNodeToMaster.assign(numLocalNodes + neighborCount, (uint32_t)-1); // bitsets tracking termination of assignments and partitioning loads galois::DynamicBitSet hostFinished; galois::DynamicBitSet loadsClear; if (async) { if (base_DistGraph::id == 0) { galois::gPrint("Using asynchronous master determination sends.\n"); } hostFinished.resize(base_DistGraph::numHosts); loadsClear.resize(base_DistGraph::numHosts); } p0allocTimer.stop(); uint64_t globalOffset = base_DistGraph::gid2host[base_DistGraph::id].first; #ifndef NDEBUG for (uint32_t i : localNodeToMaster) { assert(i == (uint32_t)-1); } #endif if (base_DistGraph::id == 0) { galois::gPrint("Number of BSP sync rounds in master assignment: ", stateRounds, "\n"); } // galois::PerThreadTimer 
ptt( // GRNAME, "Phase0DetermineMaster_" + std::string(base_DistGraph::id) //); for (unsigned syncRound = 0; syncRound < stateRounds; syncRound++) { uint32_t beginNode; uint32_t endNode; std::tie(beginNode, endNode) = galois::block_range( globalOffset, base_DistGraph::gid2host[base_DistGraph::id].second, syncRound, stateRounds); // create specific range for this block std::vector rangeVec; auto work = getSpecificThreadRange(bufGraph, rangeVec, beginNode, endNode); // debug print // galois::on_each([&] (unsigned i, unsigned j) { // galois::gDebug("[", base_DistGraph::id, " ", i, "] sync round ", // syncRound, " local range ", // *work.local_begin(), " ", *work.local_end()); //}); galois::do_all( // iterate over my read nodes galois::iterate(work), // galois::iterate(beginNode, endNode), [&](uint32_t node) { // ptt.start(); // determine master function takes source node, iterator of // neighbors uint32_t assignedHost = graphPartitioner->getMaster( node, bufGraph, localNodeToMaster, gid2offsets, nodeLoads, nodeAccum, edgeLoads, edgeAccum); // != -1 means it was assigned a host assert(assignedHost != (uint32_t)-1); // update mapping; this is a local node, so can get position // on map with subtraction localNodeToMaster[node - globalOffset] = assignedHost; // galois::gDebug("[", base_DistGraph::id, "] state round ", // syncRound, // " set ", node, " ", node - globalOffset); // ptt.stop(); }, galois::loopname("Phase0DetermineMasters"), galois::steal(), galois::no_stats()); // do synchronization of master assignment of neighbors if (!async) { syncAssignment(beginNode - globalOffset, endNode - globalOffset, numLocalNodes, localNodeToMaster, syncNodes, gid2offsets); } else { // don't need to send anything if there is nothing to send unlike sync if (beginNode != endNode) { syncAssignmentAsync(beginNode - globalOffset, endNode - globalOffset, numLocalNodes, localNodeToMaster, syncNodes, gid2offsets, hostFinished); } } // sync node/edge loads galois::StatTimer 
loadSyncTimer("Phase0LoadSyncTime", GRNAME); loadSyncTimer.start(); if (!async) { syncLoad(nodeLoads, nodeAccum); syncLoad(edgeLoads, edgeAccum); } else { asyncSyncLoad(nodeLoads, nodeAccum, edgeLoads, edgeAccum, loadsClear); } loadSyncTimer.stop(); #ifndef NDEBUG if (async) { galois::gDebug("[", base_DistGraph::id, "] host count ", hostFinished.count()); } #endif } // if asynchronous, don't move on until everything is done if (async) { galois::StatTimer waitTime("Phase0AsyncWaitTime", GRNAME); // assignment clears sendAllClears(); // load clears sendAllClears(1); hostFinished.set(base_DistGraph::id); loadsClear.set(base_DistGraph::id); waitTime.start(); while (hostFinished.count() != base_DistGraph::numHosts || loadsClear.count() != base_DistGraph::numHosts) { //#ifndef NDEBUG // galois::gDebug("[", base_DistGraph::id, "] waiting for all hosts to // finish, ", // hostFinished.count()); // galois::gDebug("[", base_DistGraph::id, "] waiting for all hosts // loads " // "syncs to finish, ", loadsClear.count()); //#endif // make sure all assignments are done and all loads are done syncAssignmentReceivesAsync(localNodeToMaster, gid2offsets, hostFinished); asyncRecvLoad(nodeLoads, edgeLoads, loadsClear); } waitTime.stop(); } #ifndef NDEBUG printLoad(nodeLoads, nodeAccum); printLoad(edgeLoads, edgeAccum); #endif // sanity check for correctness (all should be assigned) for (uint32_t i = 0; i < localNodeToMaster.size(); i++) { if (localNodeToMaster[i] == (uint32_t)-1) { // galois::gDebug("[", base_DistGraph::id, "] bad index ", i); assert(localNodeToMaster[i] != (uint32_t)-1); } } base_DistGraph::increment_evilPhase(); // increment twice if async is used as async uses 2 phases if (async) { base_DistGraph::increment_evilPhase(); } galois::gPrint("[", base_DistGraph::id, "] Local master assignment " "complete.\n"); // one more step: let masters know of nodes they own (if they don't // have the node locally then this is the only way they will learn about // it) 
galois::StatTimer p0master2ownerTimer("Phase0MastersToOwners", GRNAME); p0master2ownerTimer.start(); sendMastersToOwners(localNodeToMaster, syncNodes); recvMastersToOwners(); p0master2ownerTimer.stop(); galois::gPrint("[", base_DistGraph::id, "] Received my master mappings.\n"); base_DistGraph::increment_evilPhase(); graphPartitioner->saveGID2HostInfo(gid2offsets, localNodeToMaster, bufGraph.getNodeOffset()); } void edgeCutInspection(galois::graphs::BufferedGraph& bufGraph, galois::StatTimer& inspectionTimer, uint64_t edgeOffset, galois::gstl::Vector& prefixSumOfEdges) { galois::DynamicBitSet incomingMirrors; incomingMirrors.resize(base_DistGraph::numGlobalNodes); incomingMirrors.reset(); uint32_t myID = base_DistGraph::id; uint64_t globalOffset = base_DistGraph::gid2host[base_DistGraph::id].first; // already set before this is called base_DistGraph::localToGlobalVector.resize(base_DistGraph::numOwned); prefixSumOfEdges.resize(base_DistGraph::numOwned); auto& ltgv = base_DistGraph::localToGlobalVector; galois::do_all( galois::iterate(base_DistGraph::gid2host[base_DistGraph::id].first, base_DistGraph::gid2host[base_DistGraph::id].second), [&](size_t n) { auto ii = bufGraph.edgeBegin(n); auto ee = bufGraph.edgeEnd(n); for (; ii < ee; ++ii) { uint32_t dst = bufGraph.edgeDestination(*ii); if (graphPartitioner->retrieveMaster(dst) != myID) { incomingMirrors.set(dst); } } prefixSumOfEdges[n - globalOffset] = (*ee) - edgeOffset; ltgv[n - globalOffset] = n; }, #if MORE_DIST_STATS galois::loopname("EdgeInspectionLoop"), #endif galois::steal(), galois::no_stats()); inspectionTimer.stop(); uint64_t allBytesRead = bufGraph.getBytesRead(); galois::gPrint( "[", base_DistGraph::id, "] Edge inspection time: ", inspectionTimer.get_usec() / 1000000.0f, " seconds to read ", allBytesRead, " bytes (", allBytesRead / (float)inspectionTimer.get_usec(), " MBPS)\n"); // get incoming mirrors ready for creation uint32_t additionalMirrorCount = incomingMirrors.count(); 
base_DistGraph::localToGlobalVector.resize( base_DistGraph::localToGlobalVector.size() + additionalMirrorCount); if (base_DistGraph::numOwned > 0) { // fill prefix sum with last number (incomings have no edges) prefixSumOfEdges.resize(prefixSumOfEdges.size() + additionalMirrorCount, prefixSumOfEdges.back()); } else { prefixSumOfEdges.resize(additionalMirrorCount); } if (additionalMirrorCount > 0) { // TODO move this part below into separate function uint32_t totalNumNodes = base_DistGraph::numGlobalNodes; uint32_t activeThreads = galois::getActiveThreads(); std::vector threadPrefixSums(activeThreads); galois::on_each([&](unsigned tid, unsigned nthreads) { size_t beginNode; size_t endNode; std::tie(beginNode, endNode) = galois::block_range(0u, totalNumNodes, tid, nthreads); uint64_t count = 0; for (size_t i = beginNode; i < endNode; i++) { if (incomingMirrors.test(i)) ++count; } threadPrefixSums[tid] = count; }); // get prefix sums for (unsigned int i = 1; i < threadPrefixSums.size(); i++) { threadPrefixSums[i] += threadPrefixSums[i - 1]; } assert(threadPrefixSums.back() == additionalMirrorCount); uint32_t startingNodeIndex = base_DistGraph::numOwned; // do actual work, second on_each galois::on_each([&](unsigned tid, unsigned nthreads) { size_t beginNode; size_t endNode; std::tie(beginNode, endNode) = galois::block_range(0u, totalNumNodes, tid, nthreads); // start location to start adding things into prefix sums/vectors uint32_t threadStartLocation = 0; if (tid != 0) { threadStartLocation = threadPrefixSums[tid - 1]; } uint32_t handledNodes = 0; for (size_t i = beginNode; i < endNode; i++) { if (incomingMirrors.test(i)) { base_DistGraph::localToGlobalVector[startingNodeIndex + threadStartLocation + handledNodes] = i; handledNodes++; } } }); } base_DistGraph::numNodes = base_DistGraph::numOwned + additionalMirrorCount; if (prefixSumOfEdges.size() != 0) { base_DistGraph::numEdges = prefixSumOfEdges.back(); } else { base_DistGraph::numEdges = 0; } 
assert(base_DistGraph::localToGlobalVector.size() == base_DistGraph::numNodes); assert(prefixSumOfEdges.size() == base_DistGraph::numNodes); // g2l mapping base_DistGraph::globalToLocalMap.reserve(base_DistGraph::numNodes); for (unsigned i = 0; i < base_DistGraph::numNodes; i++) { // global to local map construction base_DistGraph::globalToLocalMap[base_DistGraph::localToGlobalVector[i]] = i; } assert(base_DistGraph::globalToLocalMap.size() == base_DistGraph::numNodes); base_DistGraph::numNodesWithEdges = base_DistGraph::numOwned; } /** * Given a loaded graph, construct the edges in the DistGraph graph. * Variant that constructs edge data as well. * * @tparam GraphTy type of graph to construct * * @param [in,out] graph Graph to construct edges in * @param bGraph Buffered graph that has edges to write into graph in memory */ template ::value>::type* = nullptr> void edgeCutLoad(GraphTy& graph, galois::graphs::BufferedGraph& bGraph) { if (base_DistGraph::id == 0) { galois::gPrint("Loading edge-data while creating edges\n"); } uint64_t globalOffset = base_DistGraph::gid2host[base_DistGraph::id].first; bGraph.resetReadCounters(); galois::StatTimer timer("EdgeLoading", GRNAME); timer.start(); galois::do_all( galois::iterate(base_DistGraph::gid2host[base_DistGraph::id].first, base_DistGraph::gid2host[base_DistGraph::id].second), [&](size_t n) { auto ii = bGraph.edgeBegin(n); auto ee = bGraph.edgeEnd(n); uint32_t lsrc = this->G2LEdgeCut(n, globalOffset); uint64_t cur = *graph.edge_begin(lsrc, galois::MethodFlag::UNPROTECTED); for (; ii < ee; ++ii) { auto gdst = bGraph.edgeDestination(*ii); decltype(gdst) ldst = this->G2LEdgeCut(gdst, globalOffset); auto gdata = bGraph.edgeData(*ii); graph.constructEdge(cur++, ldst, gdata); } assert(cur == (*graph.edge_end(lsrc))); }, #if MORE_DIST_STATS galois::loopname("EdgeLoadingLoop"), #endif galois::steal(), galois::no_stats()); timer.stop(); galois::gPrint("[", base_DistGraph::id, "] Edge loading time: ", timer.get_usec() / 
1000000.0f, " seconds to read ", bGraph.getBytesRead(), " bytes (", bGraph.getBytesRead() / (float)timer.get_usec(), " MBPS)\n"); } /** * Given a loaded graph, construct the edges in the DistGraph graph. * No edge data. * * @tparam GraphTy type of graph to construct * * @param [in,out] graph Graph to construct edges in * @param bGraph Buffered graph that has edges to write into graph in memory */ template ::value>::type* = nullptr> void edgeCutLoad(GraphTy& graph, galois::graphs::BufferedGraph& bGraph) { if (base_DistGraph::id == 0) { galois::gPrint("Loading edge-data while creating edges\n"); } uint64_t globalOffset = base_DistGraph::gid2host[base_DistGraph::id].first; bGraph.resetReadCounters(); galois::StatTimer timer("EdgeLoading", GRNAME); timer.start(); galois::do_all( galois::iterate(base_DistGraph::gid2host[base_DistGraph::id].first, base_DistGraph::gid2host[base_DistGraph::id].second), [&](size_t n) { auto ii = bGraph.edgeBegin(n); auto ee = bGraph.edgeEnd(n); uint32_t lsrc = this->G2LEdgeCut(n, globalOffset); uint64_t cur = *graph.edge_begin(lsrc, galois::MethodFlag::UNPROTECTED); for (; ii < ee; ++ii) { auto gdst = bGraph.edgeDestination(*ii); decltype(gdst) ldst = this->G2LEdgeCut(gdst, globalOffset); graph.constructEdge(cur++, ldst); } assert(cur == (*graph.edge_end(lsrc))); }, #if MORE_DIST_STATS galois::loopname("EdgeLoadingLoop"), #endif galois::steal(), galois::no_stats()); timer.stop(); galois::gPrint("[", base_DistGraph::id, "] Edge loading time: ", timer.get_usec() / 1000000.0f, " seconds to read ", bGraph.getBytesRead(), " bytes (", bGraph.getBytesRead() / (float)timer.get_usec(), " MBPS)\n"); } /** * Assign edges to hosts (but don't actually send), and send this information * out to all hosts * @param[in] bufGraph local graph to read * @param[in,out] numOutgoingEdges specifies which nodes on a host will have * outgoing edges * @param[in,out] hasIncomingEdge indicates which nodes (that need to be * created)on a host have incoming edges */ void 
edgeInspection(galois::graphs::BufferedGraph& bufGraph, std::vector>& numOutgoingEdges, std::vector& hasIncomingEdge, galois::StatTimer& inspectionTimer) { // number of nodes that this host has read from disk uint32_t numRead = base_DistGraph::gid2host[base_DistGraph::id].second - base_DistGraph::gid2host[base_DistGraph::id].first; // allocate space for outgoing edges for (uint32_t i = 0; i < base_DistGraph::numHosts; ++i) { numOutgoingEdges[i].assign(numRead, 0); } galois::DynamicBitSet hostHasOutgoing; hostHasOutgoing.resize(base_DistGraph::numHosts); hostHasOutgoing.reset(); assignEdges(bufGraph, numOutgoingEdges, hasIncomingEdge, hostHasOutgoing); inspectionTimer.stop(); // report edge inspection time uint64_t allBytesRead = bufGraph.getBytesRead(); galois::gPrint( "[", base_DistGraph::id, "] Edge inspection time: ", inspectionTimer.get_usec() / 1000000.0f, " seconds to read ", allBytesRead, " bytes (", allBytesRead / (float)inspectionTimer.get_usec(), " MBPS)\n"); // old inspection barrier // galois::runtime::getHostBarrier().wait(); sendInspectionData(numOutgoingEdges, hasIncomingEdge, hostHasOutgoing); // setup a single hasIncomingEdge bitvector uint32_t myHostID = base_DistGraph::id; if (hasIncomingEdge[myHostID].size() == 0) { hasIncomingEdge[myHostID].resize(base_DistGraph::numGlobalNodes); hasIncomingEdge[myHostID].reset(); } recvInspectionData(numOutgoingEdges, hasIncomingEdge[myHostID]); base_DistGraph::increment_evilPhase(); } /** * Inspect read edges and determine where to send them. Mark metadata as * necessary. 
* * @param[in] bufGraph local graph to read * @param[in,out] numOutgoingEdges specifies which nodes on a host will have * outgoing edges * @param[in,out] hasIncomingEdge indicates which nodes (that need to be * created)on a host have incoming edges * @param[in,out] hostHasOutgoing bitset tracking which hosts have outgoing * edges from this host */ void assignEdges(galois::graphs::BufferedGraph& bufGraph, std::vector>& numOutgoingEdges, std::vector& hasIncomingEdge, galois::DynamicBitSet& hostHasOutgoing) { std::vector> indicatorVars( base_DistGraph::numHosts); // initialize indicators of initialized bitsets to 0 for (unsigned i = 0; i < base_DistGraph::numHosts; i++) { indicatorVars[i] = 0; } // global offset into my read nodes uint64_t globalOffset = base_DistGraph::gid2host[base_DistGraph::id].first; uint32_t globalNodes = base_DistGraph::numGlobalNodes; for (unsigned syncRound = 0; syncRound < _edgeStateRounds; syncRound++) { uint32_t beginNode; uint32_t endNode; std::tie(beginNode, endNode) = galois::block_range( globalOffset, base_DistGraph::gid2host[base_DistGraph::id].second, syncRound, _edgeStateRounds); // TODO maybe edge range this? 
galois::do_all( // iterate over my read nodes galois::iterate(beginNode, endNode), [&](size_t src) { auto ee = bufGraph.edgeBegin(src); auto ee_end = bufGraph.edgeEnd(src); uint64_t numEdgesL = std::distance(ee, ee_end); for (; ee != ee_end; ee++) { uint32_t dst = bufGraph.edgeDestination(*ee); uint32_t hostBelongs = -1; hostBelongs = graphPartitioner->getEdgeOwner(src, dst, numEdgesL); if (_edgeStateRounds > 1) { hostLoads[hostBelongs] += 1; } numOutgoingEdges[hostBelongs][src - globalOffset] += 1; hostHasOutgoing.set(hostBelongs); bool hostIsMasterOfDest = (hostBelongs == graphPartitioner->retrieveMaster(dst)); // this means a mirror must be created for destination node on // that host since it will not be created otherwise if (!hostIsMasterOfDest) { auto& bitsetStatus = indicatorVars[hostBelongs]; // initialize the bitset if necessary if (bitsetStatus == 0) { char expected = 0; bool result = bitsetStatus.compare_exchange_strong(expected, 1); // i swapped successfully, therefore do allocation if (result) { hasIncomingEdge[hostBelongs].resize(globalNodes); hasIncomingEdge[hostBelongs].reset(); bitsetStatus = 2; } } // until initialized, loop while (indicatorVars[hostBelongs] != 2) ; hasIncomingEdge[hostBelongs].set(dst); } } }, #if MORE_DIST_STATS galois::loopname("AssignEdges"), #endif galois::steal(), galois::no_stats()); syncEdgeLoad(); } } /** * Given a vector specifying which nodes have edges for an unspecified * receiver host, save the masters of those nodes (which are known on this * host but not necessarily other hosts) into a vector and serialize it for * the receiver to update their master node mapping. 
* * @param b Send buffer * @param hostOutgoingEdges Number of edges that the receiver of this * vector should expect for each node on this host */ void serializeOutgoingMasterMap(galois::runtime::SendBuffer& b, const std::vector& hostOutgoingEdges) { // 2 phase: one phase determines amount of work each thread does, // second has threads actually do copies uint32_t activeThreads = galois::getActiveThreads(); std::vector threadPrefixSums(activeThreads); size_t hostSize = base_DistGraph::gid2host[base_DistGraph::id].second - base_DistGraph::gid2host[base_DistGraph::id].first; assert(hostSize == hostOutgoingEdges.size()); // for each thread, figure out how many items it will work with // (non-zero outgoing edges) galois::on_each([&](unsigned tid, unsigned nthreads) { size_t beginNode; size_t endNode; std::tie(beginNode, endNode) = galois::block_range((size_t)0, hostSize, tid, nthreads); uint64_t count = 0; for (size_t i = beginNode; i < endNode; i++) { if (hostOutgoingEdges[i] > 0) { count++; } } threadPrefixSums[tid] = count; }); // get prefix sums for (unsigned int i = 1; i < threadPrefixSums.size(); i++) { threadPrefixSums[i] += threadPrefixSums[i - 1]; } uint32_t numNonZero = threadPrefixSums[activeThreads - 1]; std::vector masterLocation; masterLocation.resize(numNonZero, (uint32_t)-1); // should only be in here if there's something to send in first place assert(numNonZero > 0); uint64_t startNode = base_DistGraph::gid2host[base_DistGraph::id].first; // do actual work, second on_each; find non-zeros again, get master // corresponding to that non-zero and send to other end galois::on_each([&](unsigned tid, unsigned nthreads) { size_t beginNode; size_t endNode; std::tie(beginNode, endNode) = galois::block_range((size_t)0, hostSize, tid, nthreads); // start location to start adding things into prefix sums/vectors uint32_t threadStartLocation = 0; if (tid != 0) { threadStartLocation = threadPrefixSums[tid - 1]; } uint32_t handledNodes = 0; for (size_t i = beginNode; i 
< endNode; i++) { if (hostOutgoingEdges[i] > 0) { // get master of i masterLocation[threadStartLocation + handledNodes] = graphPartitioner->retrieveMaster(i + startNode); handledNodes++; } } }); #ifndef NDEBUG for (uint32_t i : masterLocation) { assert(i != (uint32_t)-1); } #endif // serialize into buffer; since this is sent along with vector receiver end // will know how to deal with it galois::runtime::gSerialize(b, masterLocation); } void serializeIncomingMasterMap(galois::runtime::SendBuffer& b, const galois::DynamicBitSet& hostIncomingEdges) { size_t numOfNodes = hostIncomingEdges.count(); std::vector masterMap; masterMap.resize(numOfNodes, (uint32_t)-1); std::vector bitsetOffsets = hostIncomingEdges.getOffsets(); // size_t firstBound = base_DistGraph::gid2host[h].first; // size_t secondBound = base_DistGraph::gid2host[h].second; // galois::do_all( // galois::iterate((size_t)0, firstBound), // [&] (size_t offset) { // masterMap[offset] = // graphPartitioner->retrieveMaster(bitsetOffsets[offset]); // }, // galois::no_stats() //); galois::do_all( // galois::iterate((size_t)secondBound, numOfNodes), galois::iterate((size_t)0, numOfNodes), [&](size_t offset) { masterMap[offset] = graphPartitioner->retrieveMaster(bitsetOffsets[offset]); }, galois::no_stats()); #ifndef NDEBUG for (uint32_t i : masterMap) { assert(i != (uint32_t)-1); assert(i < base_DistGraph::numHosts); } #endif // serialize into buffer; since this is sent along with vector receiver end // will know how to deal with it galois::runtime::gSerialize(b, masterMap); } void deserializeOutgoingMasterMap( uint32_t senderHost, const std::vector& hostOutgoingEdges, const std::vector& recvMasterLocations) { uint64_t hostOffset = base_DistGraph::gid2host[senderHost].first; size_t hostSize = base_DistGraph::gid2host[senderHost].second - base_DistGraph::gid2host[senderHost].first; assert(hostSize == hostOutgoingEdges.size()); galois::DynamicBitSet offsetsToConsider; offsetsToConsider.resize(hostSize); 
offsetsToConsider.reset(); // step 1: figure out offsets that need to be handled (i.e. non-zero): only // handle if not already in map galois::do_all( galois::iterate((size_t)0, hostOutgoingEdges.size()), [&](size_t offset) { if (hostOutgoingEdges[offset] > 0) { offsetsToConsider.set(offset); } }, galois::no_stats(), galois::steal()); assert(offsetsToConsider.count() == recvMasterLocations.size()); // step 2: using bitset that tells which offsets are set, add // to already master map in partitioner (this is single threaded // since map is not a concurrent data structure) size_t curCount = 0; // size_t actuallySet = 0; for (uint32_t offset : offsetsToConsider.getOffsets()) { // galois::gDebug("[", base_DistGraph::id, "] ", " setting ", // offset + hostOffset, " from host ", senderHost, // " to ", recvMasterLocations[curCount]); graphPartitioner->addMasterMapping(offset + hostOffset, recvMasterLocations[curCount]); // bool set = graphPartitioner->addMasterMapping(offset + hostOffset, // recvMasterLocations[curCount]); // if (set) { actuallySet++; } curCount++; } // galois::gDebug("[", base_DistGraph::id, "] host ", senderHost, ": set ", // actuallySet, " out of ", recvMasterLocations.size()); } /** * Map GIDs to masters from incoming master map sent from hosts. * * @param senderHost host that sent the data * @param gids GIDs corresponding to the received master locations * @param recvMasterLocations masters of GIDs in the gids vector */ void deserializeIncomingMasterMap( const std::vector& gids, const std::vector& recvMasterLocations) { assert(gids.size() == recvMasterLocations.size()); size_t curCount = 0; for (uint64_t gid : gids) { assert(gid < base_DistGraph::numGlobalNodes); // galois::gDebug("[", base_DistGraph::id, "] ", " in-setting ", gid, " to // ", // recvMasterLocations[curCount]); graphPartitioner->addMasterMapping(gid, recvMasterLocations[curCount]); curCount++; } } /** * Send data out from inspection to other hosts. 
* * @param[in,out] numOutgoingEdges specifies which nodes on a host will have * outgoing edges * @param[in,out] hasIncomingEdge indicates which nodes (that need to be * created)on a host have incoming edges * @param[in] hostHasOutgoing bitset tracking which hosts have outgoing * edges from this host */ void sendInspectionData(std::vector>& numOutgoingEdges, std::vector& hasIncomingEdge, galois::DynamicBitSet& hostHasOutgoing) { auto& net = galois::runtime::getSystemNetworkInterface(); galois::GAccumulator bytesSent; bytesSent.reset(); for (unsigned h = 0; h < net.Num; h++) { if (h == net.ID) { // i have no outgoing edges i will keep; go ahead and clear if (!hostHasOutgoing.test(h)) { numOutgoingEdges[h].clear(); } continue; } // send outgoing edges data off to comm partner galois::runtime::SendBuffer b; // only send if non-zeros exist if (hostHasOutgoing.test(h)) { galois::runtime::gSerialize(b, 1); // token saying data exists galois::runtime::gSerialize(b, numOutgoingEdges[h]); if (graphPartitioner->masterAssignPhase()) { serializeOutgoingMasterMap(b, numOutgoingEdges[h]); } } else { galois::runtime::gSerialize(b, 0); // token saying no data exists } numOutgoingEdges[h].clear(); // determine form to send bitset in galois::DynamicBitSet& curBitset = hasIncomingEdge[h]; uint64_t bitsetSize = curBitset.size(); // num bits uint64_t onlyOffsetsSize = curBitset.count() * 32; if (bitsetSize == 0) { // there was nothing there to send in first place galois::runtime::gSerialize(b, 0); } else if (onlyOffsetsSize <= bitsetSize) { // send only offsets std::vector offsets = curBitset.getOffsets(); galois::runtime::gSerialize(b, 2); // 2 = only offsets galois::runtime::gSerialize(b, offsets); if (graphPartitioner->masterAssignPhase()) { // galois::gDebug("incoming master map serialization"); // serializeIncomingMasterMap(b, curBitset, h); serializeIncomingMasterMap(b, curBitset); } } else { // send entire bitset galois::runtime::gSerialize(b, 1); galois::runtime::gSerialize(b, 
curBitset); if (graphPartitioner->masterAssignPhase()) { // galois::gDebug("incoming master map serialization"); // serializeIncomingMasterMap(b, curBitset, h); serializeIncomingMasterMap(b, curBitset); } } // get memory from bitset back curBitset.resize(0); bytesSent.update(b.size()); // send buffer and free memory net.sendTagged(h, galois::runtime::evilPhase, b); b.getVec().clear(); } galois::runtime::reportStat_Tsum( GRNAME, std::string("EdgeInspectionBytesSent"), bytesSent.reduce()); galois::gPrint("[", base_DistGraph::id, "] Inspection sends complete.\n"); } /** * Receive data from inspection from other hosts. Processes the incoming * edge bitsets/offsets. * * @param[in,out] numOutgoingEdges specifies which nodes on a host will have * outgoing edges * @param[in,out] hasIncomingEdge indicates which nodes (that need to be * created) on this host have incoming edges */ void recvInspectionData(std::vector>& numOutgoingEdges, galois::DynamicBitSet& hasIncomingEdge) { auto& net = galois::runtime::getSystemNetworkInterface(); for (unsigned h = 0; h < net.Num - 1; h++) { // expect data from comm partner back decltype(net.recieveTagged(galois::runtime::evilPhase, nullptr)) p; do { p = net.recieveTagged(galois::runtime::evilPhase, nullptr); } while (!p); uint32_t sendingHost = p->first; // get outgoing edges; first get status var uint32_t outgoingExists = 2; galois::runtime::gDeserialize(p->second, outgoingExists); if (outgoingExists == 1) { // actual data sent galois::runtime::gDeserialize(p->second, numOutgoingEdges[sendingHost]); if (graphPartitioner->masterAssignPhase()) { std::vector recvMasterLocations; galois::runtime::gDeserialize(p->second, recvMasterLocations); deserializeOutgoingMasterMap( sendingHost, numOutgoingEdges[sendingHost], recvMasterLocations); } } else if (outgoingExists == 0) { // no data sent; just clear again numOutgoingEdges[sendingHost].clear(); } else { GALOIS_DIE("invalid recv inspection data metadata mode, outgoing"); } uint32_t 
bitsetMetaMode = 3; // initialize to invalid mode galois::runtime::gDeserialize(p->second, bitsetMetaMode); if (bitsetMetaMode == 1) { // sent as bitset; deserialize then or with main bitset galois::DynamicBitSet recvSet; galois::runtime::gDeserialize(p->second, recvSet); hasIncomingEdge.bitwise_or(recvSet); if (graphPartitioner->masterAssignPhase()) { std::vector recvMasterLocations; galois::runtime::gDeserialize(p->second, recvMasterLocations); deserializeIncomingMasterMap(recvSet.getOffsets(), recvMasterLocations); } } else if (bitsetMetaMode == 2) { // sent as vector of offsets std::vector recvOffsets; galois::runtime::gDeserialize(p->second, recvOffsets); for (uint32_t offset : recvOffsets) { hasIncomingEdge.set(offset); } if (graphPartitioner->masterAssignPhase()) { std::vector recvMasterLocations; galois::runtime::gDeserialize(p->second, recvMasterLocations); deserializeIncomingMasterMap(recvOffsets, recvMasterLocations); } } else if (bitsetMetaMode == 0) { // do nothing; there was nothing to receive } else { GALOIS_DIE("invalid recv inspection data metadata mode"); } } galois::gPrint("[", base_DistGraph::id, "] Inspection receives complete.\n"); } /** * Take inspection metadata and being mapping nodes/creating prefix sums, * return the prefix sum. 
*/ galois::gstl::Vector nodeMapping(std::vector>& numOutgoingEdges, galois::DynamicBitSet& hasIncomingEdge, galois::gstl::Vector& prefixSumOfEdges) { base_DistGraph::numNodes = 0; base_DistGraph::numEdges = 0; nodesToReceive = 0; // reserve overestimation of nodes prefixSumOfEdges.reserve(base_DistGraph::numGlobalNodes / base_DistGraph::numHosts * 1.15); base_DistGraph::localToGlobalVector.reserve( base_DistGraph::numGlobalNodes / base_DistGraph::numHosts * 1.15); inspectMasterNodes(numOutgoingEdges, prefixSumOfEdges); inspectOutgoingNodes(numOutgoingEdges, prefixSumOfEdges); createIntermediateMetadata(prefixSumOfEdges, hasIncomingEdge.count()); inspectIncomingNodes(hasIncomingEdge, prefixSumOfEdges); finalizeInspection(prefixSumOfEdges); galois::gDebug("[", base_DistGraph::id, "] To receive this many nodes: ", nodesToReceive); galois::gPrint("[", base_DistGraph::id, "] Inspection mapping complete.\n"); return prefixSumOfEdges; } /** * Inspect master nodes; loop over all nodes, determine if master; if is, * create mapping + get num edges */ void inspectMasterNodes(std::vector>& numOutgoingEdges, galois::gstl::Vector& prefixSumOfEdges) { uint32_t myHID = base_DistGraph::id; galois::GAccumulator toReceive; toReceive.reset(); for (unsigned h = 0; h < base_DistGraph::numHosts; ++h) { uint32_t activeThreads = galois::getActiveThreads(); std::vector threadPrefixSums(activeThreads); uint64_t startNode = base_DistGraph::gid2host[h].first; uint64_t lastNode = base_DistGraph::gid2host[h].second; size_t hostSize = lastNode - startNode; if (numOutgoingEdges[h].size() != 0) { assert(hostSize == numOutgoingEdges[h].size()); } // for each thread, figure out how many items it will work with (only // owned nodes) galois::on_each([&](unsigned tid, unsigned nthreads) { size_t beginNode; size_t endNode; // loop over all nodes that host h has read std::tie(beginNode, endNode) = galois::block_range((size_t)0, hostSize, tid, nthreads); uint64_t count = 0; for (size_t i = beginNode; i < 
endNode; i++) { // galois::gDebug("[", base_DistGraph::id, "] ", i + startNode, // " mapped to ", // graphPartitioner->retrieveMaster(i+startNode)); if (graphPartitioner->retrieveMaster(i + startNode) == myHID) { count++; } } threadPrefixSums[tid] = count; }); // get prefix sums for (unsigned int i = 1; i < threadPrefixSums.size(); i++) { threadPrefixSums[i] += threadPrefixSums[i - 1]; } assert(prefixSumOfEdges.size() == base_DistGraph::numNodes); assert(base_DistGraph::localToGlobalVector.size() == base_DistGraph::numNodes); uint32_t newMasterNodes = threadPrefixSums[activeThreads - 1]; galois::gDebug("[", base_DistGraph::id, "] This many masters from host ", h, ": ", newMasterNodes); uint32_t startingNodeIndex = base_DistGraph::numNodes; // increase size of prefix sum + mapping vector prefixSumOfEdges.resize(base_DistGraph::numNodes + newMasterNodes); base_DistGraph::localToGlobalVector.resize(base_DistGraph::numNodes + newMasterNodes); if (newMasterNodes > 0) { // do actual work, second on_each galois::on_each([&](unsigned tid, unsigned nthreads) { size_t beginNode; size_t endNode; std::tie(beginNode, endNode) = galois::block_range((size_t)0, hostSize, tid, nthreads); // start location to start adding things into prefix sums/vectors uint32_t threadStartLocation = 0; if (tid != 0) { threadStartLocation = threadPrefixSums[tid - 1]; } uint32_t handledNodes = 0; for (size_t i = beginNode; i < endNode; i++) { uint32_t globalID = startNode + i; // if this node is master, get outgoing edges + save mapping if (graphPartitioner->retrieveMaster(globalID) == myHID) { // check size if (numOutgoingEdges[h].size() > 0) { uint64_t myEdges = numOutgoingEdges[h][i]; numOutgoingEdges[h][i] = 0; // set to 0; does not need to be // handled later prefixSumOfEdges[startingNodeIndex + threadStartLocation + handledNodes] = myEdges; if (myEdges > 0 && h != myHID) { toReceive += 1; } } else { prefixSumOfEdges[startingNodeIndex + threadStartLocation + handledNodes] = 0; } 
base_DistGraph::localToGlobalVector[startingNodeIndex + threadStartLocation + handledNodes] = globalID; handledNodes++; } } }); base_DistGraph::numNodes += newMasterNodes; } } nodesToReceive += toReceive.reduce(); // masters have been handled base_DistGraph::numOwned = base_DistGraph::numNodes; } /** * Outgoing inspection: loop over all nodes, determnine if outgoing exists; * if does, create mapping, get edges */ void inspectOutgoingNodes(std::vector>& numOutgoingEdges, galois::gstl::Vector& prefixSumOfEdges) { uint32_t myHID = base_DistGraph::id; galois::GAccumulator toReceive; toReceive.reset(); for (unsigned h = 0; h < base_DistGraph::numHosts; ++h) { size_t hostSize = numOutgoingEdges[h].size(); // if i got no outgoing info from this host, safely continue to next one if (hostSize == 0) { continue; } uint32_t activeThreads = galois::getActiveThreads(); std::vector threadPrefixSums(activeThreads); // for each thread, figure out how many items it will work with (only // owned nodes) galois::on_each([&](unsigned tid, unsigned nthreads) { size_t beginNode; size_t endNode; std::tie(beginNode, endNode) = galois::block_range((size_t)0, hostSize, tid, nthreads); uint64_t count = 0; for (size_t i = beginNode; i < endNode; i++) { if (numOutgoingEdges[h][i] > 0) { count++; } } threadPrefixSums[tid] = count; }); // get prefix sums for (unsigned int i = 1; i < threadPrefixSums.size(); i++) { threadPrefixSums[i] += threadPrefixSums[i - 1]; } assert(prefixSumOfEdges.size() == base_DistGraph::numNodes); assert(base_DistGraph::localToGlobalVector.size() == base_DistGraph::numNodes); uint32_t newOutgoingNodes = threadPrefixSums[activeThreads - 1]; // increase size of prefix sum + mapping vector prefixSumOfEdges.resize(base_DistGraph::numNodes + newOutgoingNodes); base_DistGraph::localToGlobalVector.resize(base_DistGraph::numNodes + newOutgoingNodes); uint64_t startNode = base_DistGraph::gid2host[h].first; uint32_t startingNodeIndex = base_DistGraph::numNodes; if (newOutgoingNodes 
> 0) { // do actual work, second on_each galois::on_each([&](unsigned tid, unsigned nthreads) { size_t beginNode; size_t endNode; std::tie(beginNode, endNode) = galois::block_range((size_t)0, hostSize, tid, nthreads); // start location to start adding things into prefix sums/vectors uint32_t threadStartLocation = 0; if (tid != 0) { threadStartLocation = threadPrefixSums[tid - 1]; } uint32_t handledNodes = 0; for (size_t i = beginNode; i < endNode; i++) { uint64_t myEdges = numOutgoingEdges[h][i]; if (myEdges > 0) { prefixSumOfEdges[startingNodeIndex + threadStartLocation + handledNodes] = myEdges; base_DistGraph::localToGlobalVector[startingNodeIndex + threadStartLocation + handledNodes] = startNode + i; handledNodes++; if (myEdges > 0 && h != myHID) { toReceive += 1; } } } }); base_DistGraph::numNodes += newOutgoingNodes; } // don't need anymore after this point; get memory back numOutgoingEdges[h].clear(); } nodesToReceive += toReceive.reduce(); base_DistGraph::numNodesWithEdges = base_DistGraph::numNodes; } /** * Create a part of the global to local map (it's missing the incoming * mirrors with no edges) + part of prefix sum * * @param[in, out] prefixSumOfEdges edge prefix sum to build * @param[in] incomingEstimate estimate of number of incoming nodes to build */ void createIntermediateMetadata(galois::gstl::Vector& prefixSumOfEdges, const uint64_t incomingEstimate) { if (base_DistGraph::numNodes == 0) { return; } base_DistGraph::globalToLocalMap.reserve(base_DistGraph::numNodesWithEdges + incomingEstimate); base_DistGraph::globalToLocalMap[base_DistGraph::localToGlobalVector[0]] = 0; // global to local map construction using num nodes with edges for (unsigned i = 1; i < base_DistGraph::numNodesWithEdges; i++) { prefixSumOfEdges[i] += prefixSumOfEdges[i - 1]; base_DistGraph::globalToLocalMap[base_DistGraph::localToGlobalVector[i]] = i; } } /** * incoming node creation if is doesn't already exist + if actually amrked * as having incoming node */ void 
inspectIncomingNodes(galois::DynamicBitSet& hasIncomingEdge, galois::gstl::Vector& prefixSumOfEdges) { uint32_t totalNumNodes = base_DistGraph::numGlobalNodes; uint32_t activeThreads = galois::getActiveThreads(); std::vector threadPrefixSums(activeThreads); galois::on_each([&](unsigned tid, unsigned nthreads) { size_t beginNode; size_t endNode; std::tie(beginNode, endNode) = galois::block_range(0u, totalNumNodes, tid, nthreads); uint64_t count = 0; for (size_t i = beginNode; i < endNode; i++) { // only count if doesn't exist in global/local map + is incoming // edge if (hasIncomingEdge.test(i) && !base_DistGraph::globalToLocalMap.count(i)) ++count; } threadPrefixSums[tid] = count; }); // get prefix sums for (unsigned int i = 1; i < threadPrefixSums.size(); i++) { threadPrefixSums[i] += threadPrefixSums[i - 1]; } assert(prefixSumOfEdges.size() == base_DistGraph::numNodes); assert(base_DistGraph::localToGlobalVector.size() == base_DistGraph::numNodes); uint32_t newIncomingNodes = threadPrefixSums[activeThreads - 1]; // increase size of prefix sum + mapping vector prefixSumOfEdges.resize(base_DistGraph::numNodes + newIncomingNodes); base_DistGraph::localToGlobalVector.resize(base_DistGraph::numNodes + newIncomingNodes); uint32_t startingNodeIndex = base_DistGraph::numNodes; if (newIncomingNodes > 0) { // do actual work, second on_each galois::on_each([&](unsigned tid, unsigned nthreads) { size_t beginNode; size_t endNode; std::tie(beginNode, endNode) = galois::block_range(0u, totalNumNodes, tid, nthreads); // start location to start adding things into prefix sums/vectors uint32_t threadStartLocation = 0; if (tid != 0) { threadStartLocation = threadPrefixSums[tid - 1]; } uint32_t handledNodes = 0; for (size_t i = beginNode; i < endNode; i++) { if (hasIncomingEdge.test(i) && !base_DistGraph::globalToLocalMap.count(i)) { prefixSumOfEdges[startingNodeIndex + threadStartLocation + handledNodes] = 0; base_DistGraph::localToGlobalVector[startingNodeIndex + 
threadStartLocation + handledNodes] = i; handledNodes++; } } }); base_DistGraph::numNodes += newIncomingNodes; } } /** * finalize metadata maps */ void finalizeInspection(galois::gstl::Vector& prefixSumOfEdges) { // reserve rest of memory needed base_DistGraph::globalToLocalMap.reserve(base_DistGraph::numNodes); for (unsigned i = base_DistGraph::numNodesWithEdges; i < base_DistGraph::numNodes; i++) { // finalize prefix sum prefixSumOfEdges[i] += prefixSumOfEdges[i - 1]; // global to local map construction base_DistGraph::globalToLocalMap[base_DistGraph::localToGlobalVector[i]] = i; } if (prefixSumOfEdges.size() != 0) { base_DistGraph::numEdges = prefixSumOfEdges.back(); } else { base_DistGraph::numEdges = 0; } } //////////////////////////////////////////////////////////////////////////////// /** * Fill up mirror arrays. * TODO make parallel? */ void fillMirrors() { base_DistGraph::mirrorNodes.reserve(base_DistGraph::numNodes - base_DistGraph::numOwned); for (uint32_t i = base_DistGraph::numOwned; i < base_DistGraph::numNodes; i++) { uint32_t globalID = base_DistGraph::localToGlobalVector[i]; base_DistGraph::mirrorNodes[graphPartitioner->retrieveMaster(globalID)] .push_back(globalID); } } //////////////////////////////////////////////////////////////////////////////// template void loadEdges(GraphTy& graph, galois::graphs::BufferedGraph& bufGraph) { if (base_DistGraph::id == 0) { if (std::is_void::value) { fprintf(stderr, "Loading void edge-data while creating edges.\n"); } else { fprintf(stderr, "Loading edge-data while creating edges.\n"); } } bufGraph.resetReadCounters(); std::atomic receivedNodes; receivedNodes.store(0); galois::StatTimer loadEdgeTimer("EdgeLoading", GRNAME); loadEdgeTimer.start(); // sends data sendEdges(graph, bufGraph, receivedNodes); uint64_t bufBytesRead = bufGraph.getBytesRead(); // get data from graph back (don't need it after sending things out) bufGraph.resetAndFree(); // receives data galois::on_each( [&](unsigned, unsigned) { 
receiveEdges(graph, receivedNodes); }); base_DistGraph::increment_evilPhase(); loadEdgeTimer.stop(); galois::gPrint("[", base_DistGraph::id, "] Edge loading time: ", loadEdgeTimer.get_usec() / 1000000.0f, " seconds to read ", bufBytesRead, " bytes (", bufBytesRead / (float)loadEdgeTimer.get_usec(), " MBPS)\n"); } // Edge type is not void. (i.e. edge data exists) template ::value>::type* = nullptr> void sendEdges(GraphTy& graph, galois::graphs::BufferedGraph& bufGraph, std::atomic& receivedNodes) { using DstVecType = std::vector>; using DataVecType = std::vector>; using SendBufferVecTy = std::vector; galois::substrate::PerThreadStorage gdst_vecs( base_DistGraph::numHosts); galois::substrate::PerThreadStorage gdata_vecs( base_DistGraph::numHosts); galois::substrate::PerThreadStorage sendBuffers( base_DistGraph::numHosts); auto& net = galois::runtime::getSystemNetworkInterface(); const unsigned& id = this->base_DistGraph::id; const unsigned& numHosts = this->base_DistGraph::numHosts; galois::GAccumulator messagesSent; galois::GAccumulator bytesSent; galois::GReduceMax maxBytesSent; messagesSent.reset(); bytesSent.reset(); maxBytesSent.reset(); for (unsigned syncRound = 0; syncRound < _edgeStateRounds; syncRound++) { uint32_t beginNode; uint32_t endNode; std::tie(beginNode, endNode) = galois::block_range( base_DistGraph::gid2host[base_DistGraph::id].first, base_DistGraph::gid2host[base_DistGraph::id].second, syncRound, _edgeStateRounds); // Go over assigned nodes and distribute edges. 
galois::do_all( galois::iterate(beginNode, endNode), [&](uint64_t src) { uint32_t lsrc = 0; uint64_t curEdge = 0; if (base_DistGraph::isLocal(src)) { lsrc = this->G2L(src); curEdge = *graph.edge_begin(lsrc, galois::MethodFlag::UNPROTECTED); } auto ee = bufGraph.edgeBegin(src); auto ee_end = bufGraph.edgeEnd(src); uint64_t numEdgesL = std::distance(ee, ee_end); auto& gdst_vec = *gdst_vecs.getLocal(); auto& gdata_vec = *gdata_vecs.getLocal(); for (unsigned i = 0; i < numHosts; ++i) { gdst_vec[i].clear(); gdata_vec[i].clear(); gdst_vec[i].reserve(numEdgesL); // gdata_vec[i].reserve(numEdgesL); } for (; ee != ee_end; ++ee) { uint32_t gdst = bufGraph.edgeDestination(*ee); auto gdata = bufGraph.edgeData(*ee); uint32_t hostBelongs = graphPartitioner->getEdgeOwner(src, gdst, numEdgesL); if (_edgeStateRounds > 1) { hostLoads[hostBelongs] += 1; } if (hostBelongs == id) { // edge belongs here, construct on self assert(base_DistGraph::isLocal(src)); uint32_t ldst = this->G2L(gdst); graph.constructEdge(curEdge++, ldst, gdata); // TODO // if ldst is an outgoing mirror, this is vertex cut } else { // add to host vector to send out later gdst_vec[hostBelongs].push_back(gdst); gdata_vec[hostBelongs].push_back(gdata); } } // make sure all edges accounted for if local if (base_DistGraph::isLocal(src)) { assert(curEdge == (*graph.edge_end(lsrc))); } // send for (uint32_t h = 0; h < numHosts; ++h) { if (h == id) continue; if (gdst_vec[h].size() > 0) { auto& b = (*sendBuffers.getLocal())[h]; galois::runtime::gSerialize(b, src); galois::runtime::gSerialize(b, gdst_vec[h]); galois::runtime::gSerialize(b, gdata_vec[h]); // send if over limit if (b.size() > edgePartitionSendBufSize) { messagesSent += 1; bytesSent.update(b.size()); maxBytesSent.update(b.size()); net.sendTagged(h, galois::runtime::evilPhase, b); b.getVec().clear(); b.getVec().reserve(edgePartitionSendBufSize * 1.25); } } } // overlap receives auto buffer = net.recieveTagged(galois::runtime::evilPhase, nullptr); 
this->processReceivedEdgeBuffer(buffer, graph, receivedNodes); }, #if MORE_DIST_STATS galois::loopname("EdgeLoadingLoop"), #endif galois::steal(), galois::no_stats()); syncEdgeLoad(); // printEdgeLoad(); } // flush buffers for (unsigned threadNum = 0; threadNum < sendBuffers.size(); ++threadNum) { auto& sbr = *sendBuffers.getRemote(threadNum); for (unsigned h = 0; h < this->base_DistGraph::numHosts; ++h) { if (h == this->base_DistGraph::id) continue; auto& sendBuffer = sbr[h]; if (sendBuffer.size() > 0) { messagesSent += 1; bytesSent.update(sendBuffer.size()); maxBytesSent.update(sendBuffer.size()); net.sendTagged(h, galois::runtime::evilPhase, sendBuffer); sendBuffer.getVec().clear(); } } } net.flush(); galois::runtime::reportStat_Tsum( GRNAME, std::string("EdgeLoadingMessagesSent"), messagesSent.reduce()); galois::runtime::reportStat_Tsum( GRNAME, std::string("EdgeLoadingBytesSent"), bytesSent.reduce()); galois::runtime::reportStat_Tmax( GRNAME, std::string("EdgeLoadingMaxBytesSent"), maxBytesSent.reduce()); } // no edge data version template ::value>::type* = nullptr> void sendEdges(GraphTy& graph, galois::graphs::BufferedGraph& bufGraph, std::atomic& receivedNodes) { using DstVecType = std::vector>; using SendBufferVecTy = std::vector; galois::substrate::PerThreadStorage gdst_vecs( base_DistGraph::numHosts); galois::substrate::PerThreadStorage sendBuffers( base_DistGraph::numHosts); auto& net = galois::runtime::getSystemNetworkInterface(); const unsigned& id = this->base_DistGraph::id; const unsigned& numHosts = this->base_DistGraph::numHosts; galois::GAccumulator messagesSent; galois::GAccumulator bytesSent; galois::GReduceMax maxBytesSent; messagesSent.reset(); bytesSent.reset(); maxBytesSent.reset(); for (unsigned syncRound = 0; syncRound < _edgeStateRounds; syncRound++) { uint64_t beginNode; uint64_t endNode; std::tie(beginNode, endNode) = galois::block_range( base_DistGraph::gid2host[base_DistGraph::id].first, 
base_DistGraph::gid2host[base_DistGraph::id].second, syncRound, _edgeStateRounds); // Go over assigned nodes and distribute edges. galois::do_all( galois::iterate(beginNode, endNode), [&](uint64_t src) { uint32_t lsrc = 0; uint64_t curEdge = 0; if (base_DistGraph::isLocal(src)) { lsrc = this->G2L(src); curEdge = *graph.edge_begin(lsrc, galois::MethodFlag::UNPROTECTED); } auto ee = bufGraph.edgeBegin(src); auto ee_end = bufGraph.edgeEnd(src); uint64_t numEdgesL = std::distance(ee, ee_end); auto& gdst_vec = *gdst_vecs.getLocal(); for (unsigned i = 0; i < numHosts; ++i) { gdst_vec[i].clear(); // gdst_vec[i].reserve(numEdgesL); } for (; ee != ee_end; ++ee) { uint32_t gdst = bufGraph.edgeDestination(*ee); uint32_t hostBelongs = graphPartitioner->getEdgeOwner(src, gdst, numEdgesL); if (_edgeStateRounds > 1) { hostLoads[hostBelongs] += 1; } if (hostBelongs == id) { // edge belongs here, construct on self assert(base_DistGraph::isLocal(src)); uint32_t ldst = this->G2L(gdst); graph.constructEdge(curEdge++, ldst); // TODO // if ldst is an outgoing mirror, this is vertex cut } else { // add to host vector to send out later gdst_vec[hostBelongs].push_back(gdst); } } // make sure all edges accounted for if local if (base_DistGraph::isLocal(src)) { assert(curEdge == (*graph.edge_end(lsrc))); } // send for (uint32_t h = 0; h < numHosts; ++h) { if (h == id) continue; if (gdst_vec[h].size() > 0) { auto& b = (*sendBuffers.getLocal())[h]; galois::runtime::gSerialize(b, src); galois::runtime::gSerialize(b, gdst_vec[h]); // send if over limit if (b.size() > edgePartitionSendBufSize) { messagesSent += 1; bytesSent.update(b.size()); maxBytesSent.update(b.size()); net.sendTagged(h, galois::runtime::evilPhase, b); b.getVec().clear(); b.getVec().reserve(edgePartitionSendBufSize * 1.25); } } } // overlap receives auto buffer = net.recieveTagged(galois::runtime::evilPhase, nullptr); this->processReceivedEdgeBuffer(buffer, graph, receivedNodes); }, #if MORE_DIST_STATS 
galois::loopname("EdgeLoading"), #endif galois::steal(), galois::no_stats()); syncEdgeLoad(); // printEdgeLoad(); } // flush buffers for (unsigned threadNum = 0; threadNum < sendBuffers.size(); ++threadNum) { auto& sbr = *sendBuffers.getRemote(threadNum); for (unsigned h = 0; h < this->base_DistGraph::numHosts; ++h) { if (h == this->base_DistGraph::id) continue; auto& sendBuffer = sbr[h]; if (sendBuffer.size() > 0) { messagesSent += 1; bytesSent.update(sendBuffer.size()); maxBytesSent.update(sendBuffer.size()); net.sendTagged(h, galois::runtime::evilPhase, sendBuffer); sendBuffer.getVec().clear(); } } } net.flush(); galois::runtime::reportStat_Tsum( GRNAME, std::string("EdgeLoadingMessagesSent"), messagesSent.reduce()); galois::runtime::reportStat_Tsum( GRNAME, std::string("EdgeLoadingBytesSent"), bytesSent.reduce()); galois::runtime::reportStat_Tmax( GRNAME, std::string("EdgeLoadingMaxBytesSent"), maxBytesSent.reduce()); } //! @copydoc DistGraphHybridCut::processReceivedEdgeBuffer template void processReceivedEdgeBuffer( std::optional>& buffer, GraphTy& graph, std::atomic& receivedNodes) { if (buffer) { auto& rb = buffer->second; while (rb.r_size() > 0) { uint64_t n; std::vector gdst_vec; galois::runtime::gDeserialize(rb, n); galois::runtime::gDeserialize(rb, gdst_vec); assert(base_DistGraph::isLocal(n)); uint32_t lsrc = this->G2L(n); uint64_t cur = *graph.edge_begin(lsrc, galois::MethodFlag::UNPROTECTED); uint64_t cur_end = *graph.edge_end(lsrc); assert((cur_end - cur) == gdst_vec.size()); deserializeEdges(graph, rb, gdst_vec, cur, cur_end); ++receivedNodes; } } } /** * Receive the edge dest/data assigned to this host from other hosts * that were responsible for reading them. 
*/ template void receiveEdges(GraphTy& graph, std::atomic& receivedNodes) { auto& net = galois::runtime::getSystemNetworkInterface(); // receive edges for all mirror nodes while (receivedNodes < nodesToReceive) { decltype(net.recieveTagged(galois::runtime::evilPhase, nullptr)) p; p = net.recieveTagged(galois::runtime::evilPhase, nullptr); processReceivedEdgeBuffer(p, graph, receivedNodes); } } template ::value>::type* = nullptr> void deserializeEdges(GraphTy& graph, galois::runtime::RecvBuffer& b, std::vector& gdst_vec, uint64_t& cur, uint64_t& cur_end) { std::vector gdata_vec; galois::runtime::gDeserialize(b, gdata_vec); uint64_t i = 0; while (cur < cur_end) { auto gdata = gdata_vec[i]; uint64_t gdst = gdst_vec[i++]; uint32_t ldst = this->G2L(gdst); graph.constructEdge(cur++, ldst, gdata); // TODO if ldst is an outgoing mirror, this is vertex cut } } template ::value>::type* = nullptr> void deserializeEdges(GraphTy& graph, galois::runtime::RecvBuffer&, std::vector& gdst_vec, uint64_t& cur, uint64_t& cur_end) { uint64_t i = 0; while (cur < cur_end) { uint64_t gdst = gdst_vec[i++]; uint32_t ldst = this->G2L(gdst); graph.constructEdge(cur++, ldst); // TODO if ldst is an outgoing mirror, this is vertex cut } } }; // make GRNAME visible to public template constexpr const char* const galois::graphs::NewDistGraphGeneric::GRNAME; } // end namespace graphs } // end namespace galois #endif ================================================ FILE: libdist/CMakeLists.txt ================================================ add_library(galois_dist_async STATIC) add_library(Galois::dist_async ALIAS galois_dist_async) add_dependencies(lib galois_dist_async) set_target_properties(galois_dist_async PROPERTIES EXPORT_NAME dist_async) target_sources(galois_dist_async PRIVATE src/Barrier.cpp src/DistGalois.cpp src/DistStats.cpp src/Network.cpp src/NetworkBuffered.cpp src/NetworkIOMPI.cpp src/NetworkLCI.cpp ) target_include_directories(galois_dist_async PUBLIC $ $ ) 
target_link_libraries(galois_dist_async PUBLIC MPI::MPI_CXX)
target_link_libraries(galois_dist_async PUBLIC galois_shmem)

target_compile_definitions(galois_dist_async PRIVATE GALOIS_SUPPORT_ASYNC=1)

if (GALOIS_USE_BARE_MPI)
  target_compile_definitions(galois_dist_async PRIVATE GALOIS_USE_BARE_MPI=1)
endif()

if (GALOIS_USE_LCI)
  add_definitions(-DGALOIS_USE_LCI)
  set(LCI_ROOT "${CMAKE_BINARY_DIR}/libdist/external/src/lci")
  set(LCI_INCLUDE "${LCI_ROOT}/include")
  set(LCI_LIBRARY "${LCI_ROOT}/liblci.a")
  include(ExternalProject)
  # do not clone submodules for external projects
  cmake_policy(SET CMP0097 NEW)
  ExternalProject_Add(lci
    PREFIX external
    BUILD_IN_SOURCE 1
    CONFIGURE_COMMAND ""
    INSTALL_COMMAND ""
    LOG_OUTPUT_ON_FAILURE 1
    GIT_REPOSITORY "https://github.com/uiuc-hpc/LC.git"
    GIT_SUBMODULES ""
    GIT_TAG "9bf912829339879e1132614c6d24cd032c32366b")
  add_dependencies(galois_dist_async lci)
  target_link_libraries(galois_dist_async PRIVATE ${LCI_LIBRARY} -lpsm2)
  # NOTE(review): generator expressions were destroyed by extraction;
  # reconstructed as the conventional build/install interface pair — confirm
  # against upstream.
  target_include_directories(galois_dist_async PUBLIC
    $<BUILD_INTERFACE:${LCI_INCLUDE}>
    $<INSTALL_INTERFACE:include>
  )
endif(GALOIS_USE_LCI)

install(
  DIRECTORY include/
  DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}"
  COMPONENT dev
  FILES_MATCHING PATTERN "*.h"
)

install(TARGETS galois_dist_async
  EXPORT GaloisTargets
  LIBRARY
    DESTINATION "${CMAKE_INSTALL_LIBDIR}"
    COMPONENT shlib
  ARCHIVE
    DESTINATION "${CMAKE_INSTALL_LIBDIR}"
    COMPONENT lib
  INCLUDES DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}"
)


================================================
FILE: libdist/include/galois/DReducible.h
================================================
/*
 * This file belongs to the Galois project, a C++ library for exploiting
 * parallelism. The code is being released under the terms of the 3-Clause BSD
 * License (a copy is located in LICENSE.txt at the top-level directory).
 *
 * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.
 * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS
 * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF
 * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF
 * DEALING OR USAGE OF TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH
 * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances
 * shall University be liable for incidental, special, indirect, direct or
 * consequential damages or loss of profits, interruption of business, or
 * related expenses which may arise from use of Software or Documentation,
 * including but not limited to those resulting from defects in Software and/or
 * Documentation, or loss or inaccuracy of data of any kind.
 */

/**
 * @file DReducible.h
 *
 * Implements distributed reducible objects for easy reduction of values
 * across a distributed system.
 */

#ifndef GALOIS_DISTACCUMULATOR_H
#define GALOIS_DISTACCUMULATOR_H

// NOTE(review): the header name inside <> was destroyed by extraction;
// <limits> matches the std::numeric_limits uses below — confirm upstream.
#include <limits>

#include "galois/Galois.h"
#include "galois/Reduction.h"
#include "galois/AtomicHelpers.h"
#include "galois/runtime/LWCI.h"
#include "galois/runtime/DistStats.h"

namespace galois {

/**
 * Distributed sum-reducer for getting the sum of some value across multiple
 * hosts.
* * @tparam Ty type of value to max-reduce */ template class DGAccumulator { galois::runtime::NetworkInterface& net = galois::runtime::getSystemNetworkInterface(); galois::GAccumulator mdata; Ty local_mdata, global_mdata; #ifdef GALOIS_USE_LCI /** * Sum reduction using LWCI */ inline void reduce_lwci() { lc_alreduce(&local_mdata, &global_mdata, sizeof(Ty), &galois::runtime::internal::ompi_op_sum, lc_col_ep); } #else /** * Sum reduction using MPI */ inline void reduce_mpi() { if (typeid(Ty) == typeid(int32_t)) { MPI_Allreduce(&local_mdata, &global_mdata, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); } else if (typeid(Ty) == typeid(int64_t)) { MPI_Allreduce(&local_mdata, &global_mdata, 1, MPI_LONG, MPI_SUM, MPI_COMM_WORLD); } else if (typeid(Ty) == typeid(uint32_t)) { MPI_Allreduce(&local_mdata, &global_mdata, 1, MPI_UNSIGNED, MPI_SUM, MPI_COMM_WORLD); } else if (typeid(Ty) == typeid(uint64_t)) { MPI_Allreduce(&local_mdata, &global_mdata, 1, MPI_UNSIGNED_LONG, MPI_SUM, MPI_COMM_WORLD); } else if (typeid(Ty) == typeid(float)) { MPI_Allreduce(&local_mdata, &global_mdata, 1, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD); } else if (typeid(Ty) == typeid(double)) { MPI_Allreduce(&local_mdata, &global_mdata, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); } else if (typeid(Ty) == typeid(long double)) { MPI_Allreduce(&local_mdata, &global_mdata, 1, MPI_LONG_DOUBLE, MPI_SUM, MPI_COMM_WORLD); } else { static_assert(true, "Type of DGAccumulator not supported for MPI reduction"); } } #endif public: //! Default constructor DGAccumulator() {} /** * Adds to accumulated value * * @param rhs Value to add * @returns reference to this object */ DGAccumulator& operator+=(const Ty& rhs) { mdata += rhs; return *this; } /** * Sets current value stored in accumulator. * * @param rhs Value to set */ void operator=(const Ty rhs) { mdata.reset(); mdata += rhs; } /** * Sets current value stored in accumulator. 
* * @param rhs Value to set */ void set(const Ty rhs) { mdata.reset(); mdata += rhs; } /** * Read local accumulated value. * * @returns locally accumulated value */ Ty read_local() { if (local_mdata == 0) local_mdata = mdata.reduce(); return local_mdata; } /** * Read the value returned by the last reduce call. * Should call reduce before calling this function if an up to date * value is required * * @returns the value of the last reduce call */ Ty read() { return global_mdata; } /** * Reset the entire accumulator. * * @returns the value of the last reduce call */ Ty reset() { Ty retval = global_mdata; mdata.reset(); local_mdata = global_mdata = 0; return retval; } /** * Reduce data across all hosts, saves the value, and returns the * reduced value * * @param runID optional argument used to create a statistics timer * for later reporting * * @returns The reduced value */ Ty reduce(std::string runID = std::string()) { std::string timer_str("ReduceDGAccum_" + runID); galois::CondStatTimer reduceTimer(timer_str.c_str(), "DGReducible"); reduceTimer.start(); if (local_mdata == 0) local_mdata = mdata.reduce(); #ifdef GALOIS_USE_LCI reduce_lwci(); #else reduce_mpi(); #endif reduceTimer.stop(); return global_mdata; } }; //////////////////////////////////////////////////////////////////////////////// /** * Distributed max-reducer for getting the max of some value across multiple * hosts. 
* * @tparam Ty type of value to max-reduce */ template class DGReduceMax { galois::runtime::NetworkInterface& net = galois::runtime::getSystemNetworkInterface(); galois::GReduceMax mdata; // local max reducer Ty local_mdata, global_mdata; #ifdef GALOIS_USE_LCI /** * Use LWCI to reduce max across hosts */ inline void reduce_lwci() { lc_alreduce(&local_mdata, &global_mdata, sizeof(Ty), &galois::runtime::internal::ompi_op_max, lc_col_ep); } #else /** * Use MPI to reduce max across hosts */ inline void reduce_mpi() { if (typeid(Ty) == typeid(int32_t)) { MPI_Allreduce(&local_mdata, &global_mdata, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD); } else if (typeid(Ty) == typeid(int64_t)) { MPI_Allreduce(&local_mdata, &global_mdata, 1, MPI_LONG, MPI_MAX, MPI_COMM_WORLD); } else if (typeid(Ty) == typeid(uint32_t)) { MPI_Allreduce(&local_mdata, &global_mdata, 1, MPI_UNSIGNED, MPI_MAX, MPI_COMM_WORLD); } else if (typeid(Ty) == typeid(uint64_t)) { MPI_Allreduce(&local_mdata, &global_mdata, 1, MPI_UNSIGNED_LONG, MPI_MAX, MPI_COMM_WORLD); } else if (typeid(Ty) == typeid(float)) { MPI_Allreduce(&local_mdata, &global_mdata, 1, MPI_FLOAT, MPI_MAX, MPI_COMM_WORLD); } else if (typeid(Ty) == typeid(double)) { MPI_Allreduce(&local_mdata, &global_mdata, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD); } else if (typeid(Ty) == typeid(long double)) { MPI_Allreduce(&local_mdata, &global_mdata, 1, MPI_LONG_DOUBLE, MPI_MAX, MPI_COMM_WORLD); } else { static_assert(true, "Type of DGReduceMax not supported for MPI " "reduction"); } } #endif public: /** * Default constructor; initializes everything to 0. */ DGReduceMax() { local_mdata = 0; global_mdata = 0; } /** * Update the local max-reduced value. * * @param rhs Value to max-reduce locally with */ void update(const Ty rhs) { mdata.update(rhs); } /** * Read the local reduced max value; if it has never been reduced, it will * attempt get the global value through a reduce (i.e. all other hosts * should call reduce as well). 
* * @returns the local value stored in the accumulator or a global value if * reduce has never been called */ Ty read_local() { if (local_mdata == 0) local_mdata = mdata.reduce(); return local_mdata; } /** * Read the global reduced max value. For accurate results, you should * call reduce before calling this. * * @returns the global value stored in the accumulator */ Ty read() { return global_mdata; } /** * Reset this accumulator. * * @returns the previous global value stored in this accumulator (note if * never reduced, it will be 0 */ Ty reset() { Ty retval = global_mdata; mdata.reset(); local_mdata = global_mdata = 0; return retval; } /** * Do a max reduction across all hosts by sending data to all other hosts * and reducing received data. * * @returns the max-reduced value after reducing from all hosts. */ Ty reduce(std::string runID = std::string()) { std::string timer_str("ReduceDGReduceMax_" + runID); galois::CondStatTimer reduceTimer(timer_str.c_str(), "DGReduceMax"); reduceTimer.start(); if (local_mdata == 0) local_mdata = mdata.reduce(); #ifdef GALOIS_USE_LCI reduce_lwci(); #else reduce_mpi(); #endif reduceTimer.stop(); return global_mdata; } }; //////////////////////////////////////////////////////////////////////////////// /** * Distributed min-reducer for getting the min of some value across multiple * hosts. 
* * @tparam Ty type of value to min-reduce */ template class DGReduceMin { galois::runtime::NetworkInterface& net = galois::runtime::getSystemNetworkInterface(); galois::GReduceMin mdata; // local min reducer Ty local_mdata, global_mdata; #ifdef GALOIS_USE_LCI /** * Use LWCI to reduce min across hosts */ inline void reduce_lwci() { lc_alreduce(&local_mdata, &global_mdata, sizeof(Ty), &galois::runtime::internal::ompi_op_min, lc_col_ep); } #else /** * Use MPI to reduce min across hosts */ inline void reduce_mpi() { if (typeid(Ty) == typeid(int32_t)) { MPI_Allreduce(&local_mdata, &global_mdata, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD); } else if (typeid(Ty) == typeid(int64_t)) { MPI_Allreduce(&local_mdata, &global_mdata, 1, MPI_LONG, MPI_MIN, MPI_COMM_WORLD); } else if (typeid(Ty) == typeid(uint32_t)) { MPI_Allreduce(&local_mdata, &global_mdata, 1, MPI_UNSIGNED, MPI_MIN, MPI_COMM_WORLD); } else if (typeid(Ty) == typeid(uint64_t)) { MPI_Allreduce(&local_mdata, &global_mdata, 1, MPI_UNSIGNED_LONG, MPI_MIN, MPI_COMM_WORLD); } else if (typeid(Ty) == typeid(float)) { MPI_Allreduce(&local_mdata, &global_mdata, 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD); } else if (typeid(Ty) == typeid(double)) { MPI_Allreduce(&local_mdata, &global_mdata, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD); } else if (typeid(Ty) == typeid(long double)) { MPI_Allreduce(&local_mdata, &global_mdata, 1, MPI_LONG_DOUBLE, MPI_MIN, MPI_COMM_WORLD); } else { static_assert(true, "Type of DGReduceMin not supported for MPI " "reduction"); } } #endif public: /** * Default constructor; initializes everything to the max value of the type. */ DGReduceMin() { local_mdata = std::numeric_limits::max(); global_mdata = std::numeric_limits::max(); ; } /** * Update the local min-reduced value. * * @param rhs Value to min-reduce locally with */ void update(const Ty rhs) { mdata.update(rhs); } /** * Read the local reduced min value; if it has never been reduced, it will * attempt get the global value through a reduce (i.e. 
all other hosts * should call reduce as well). * * @returns the local value stored in the accumulator or a global value if * reduce has never been called */ Ty read_local() { if (local_mdata == std::numeric_limits::max()) local_mdata = mdata.reduce(); return local_mdata; } /** * Read the global reduced min value. For accurate results, you should * call reduce before calling this. * * @returns the global value stored in the accumulator */ Ty read() { return global_mdata; } /** * Reset this accumulator. * * @returns the previous global value stored in this accumulator (note if * never reduced, it will be 0 */ Ty reset() { Ty retval = global_mdata; mdata.reset(); local_mdata = global_mdata = std::numeric_limits::max(); return retval; } /** * Do a min reduction across all hosts by sending data to all other hosts * and reducing received data. * * @returns the min-reduced value after reducing from all hosts. */ Ty reduce(std::string runID = std::string()) { std::string timer_str("ReduceDGReduceMin_" + runID); galois::CondStatTimer reduceTimer(timer_str.c_str(), "DGReduceMin"); reduceTimer.start(); if (local_mdata == std::numeric_limits::max()) local_mdata = mdata.reduce(); #ifdef GALOIS_USE_LCI reduce_lwci(); #else reduce_mpi(); #endif reduceTimer.stop(); return global_mdata; } }; } // namespace galois #endif ================================================ FILE: libdist/include/galois/DTerminationDetector.h ================================================ /* * This file belongs to the Galois project, a C++ library for exploiting * parallelism. The code is being released under the terms of the 3-Clause BSD * License (a copy is located in LICENSE.txt at the top-level directory). * * Copyright (C) 2018, The University of Texas at Austin. All rights reserved. 
* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY, * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF * DEALING OR USAGE OF TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances * shall University be liable for incidental, special, indirect, direct or * consequential damages or loss of profits, interruption of business, or * related expenses which may arise from use of Software or Documentation, * including but not limited to those resulting from defects in Software and/or * Documentation, or loss or inaccuracy of data of any kind. */ /** * @file DReducible.h * * Implements distributed reducible objects for easy reduction of values * across a distributed system. */ #ifndef GALOIS_DISTTERMINATOR_H #define GALOIS_DISTTERMINATOR_H #include #include "galois/Galois.h" #include "galois/Reduction.h" #include "galois/AtomicHelpers.h" #include "galois/runtime/LWCI.h" #include "galois/runtime/DistStats.h" namespace galois { /** * Distributed sum-reducer for getting the sum of some value across multiple * hosts. * * @tparam Ty type of value to max-reduce */ template class DGTerminator { galois::runtime::NetworkInterface& net = galois::runtime::getSystemNetworkInterface(); galois::GAccumulator mdata; Ty local_mdata, global_mdata; uint64_t prev_snapshot; uint64_t snapshot; uint64_t global_snapshot; bool work_done; #ifndef GALOIS_USE_LCI MPI_Request snapshot_request; #else lc_colreq snapshot_request; #endif public: //! 
Default constructor DGTerminator() { reinitialize(); initiate_snapshot(); reset(); } void reinitialize() { prev_snapshot = 0; snapshot = 1; global_snapshot = 1; work_done = false; } /** * Adds to accumulated value * * @param rhs Value to add * @returns reference to this object */ DGTerminator& operator+=(const Ty& rhs) { mdata += rhs; return *this; } /** * Sets current value stored in accumulator. * * @param rhs Value to set */ void operator=(const Ty rhs) { mdata.reset(); mdata += rhs; } /** * Sets current value stored in accumulator. * * @param rhs Value to set */ void set(const Ty rhs) { mdata.reset(); mdata += rhs; } /** * Read local accumulated value. * * @returns locally accumulated value */ Ty read_local() { if (local_mdata == 0) local_mdata = mdata.reduce(); return local_mdata; } /** * Read the value returned by the last reduce call. * Should call reduce before calling this function if an up to date * value is required * * @returns the value of the last reduce call */ Ty read() { return global_mdata; } /** * Reset the entire accumulator. 
* * @returns the value of the last reduce call */ Ty reset() { Ty retval = global_mdata; mdata.reset(); local_mdata = global_mdata = 0; return retval; } void initiate_snapshot() { #ifdef GALOIS_USE_LCI lc_ialreduce(&snapshot, &global_snapshot, sizeof(Ty), &galois::runtime::internal::ompi_op_max, lc_col_ep, &snapshot_request); #else MPI_Iallreduce(&snapshot, &global_snapshot, 1, MPI_UNSIGNED_LONG, MPI_MAX, MPI_COMM_WORLD, &snapshot_request); #endif } bool terminate() { bool active = (local_mdata != 0); if (!active) { active = net.anyPendingSends(); } int snapshot_ended = 0; if (!active) { #ifndef GALOIS_USE_LCI MPI_Test(&snapshot_request, &snapshot_ended, MPI_STATUS_IGNORE); #else lc_col_progress(&snapshot_request); snapshot_ended = snapshot_request.flag; #endif } if (!active) { // check pending receives after checking snapshot active = net.anyPendingReceives(); if (active) galois::gDebug("[", net.ID, "] pending receive"); } if (active) { work_done = true; } else { if (snapshot_ended != 0) { snapshot = global_snapshot; if (work_done) { work_done = false; prev_snapshot = snapshot; ++snapshot; galois::gDebug("[", net.ID, "] work done, taking snapshot ", snapshot); initiate_snapshot(); } else if (prev_snapshot != snapshot) { prev_snapshot = snapshot; galois::gDebug("[", net.ID, "] no work done, taking snapshot ", snapshot); initiate_snapshot(); } else { galois::gDebug("[", net.ID, "] terminating ", snapshot); // an explicit barrier may be required here // so that the next async phase begins on all hosts at the same time // however, this may add overheads when it is not required // (depending on when the next async phase actually begins), so // ASSUME: caller will call getHostBarrier().wait() if required reinitialize(); // for next async phase return true; } } } return false; } /** * Reduce data across all hosts, saves the value, and returns the * reduced value * * @param runID optional argument used to create a statistics timer * for later reporting * * @returns The 
reduced value */ Ty reduce(std::string runID = std::string()) { std::string timer_str("ReduceDGAccum_" + runID); galois::CondStatTimer reduceTimer(timer_str.c_str(), "DGReducible"); reduceTimer.start(); if (local_mdata == 0) local_mdata = mdata.reduce(); bool halt = terminate(); global_mdata = !halt; if (halt) { galois::runtime::evilPhase += 2; // one for reduce and one for broadcast if (galois::runtime::evilPhase >= static_cast( std::numeric_limits::max())) { // limit defined by MPI or // LCI galois::runtime::evilPhase = 1; } } reduceTimer.stop(); return global_mdata; } }; } // namespace galois #endif ================================================ FILE: libdist/include/galois/DistGalois.h ================================================ /* * This file belongs to the Galois project, a C++ library for exploiting * parallelism. The code is being released under the terms of the 3-Clause BSD * License (a copy is located in LICENSE.txt at the top-level directory). * * Copyright (C) 2018, The University of Texas at Austin. All rights reserved. * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY, * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF * DEALING OR USAGE OF TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances * shall University be liable for incidental, special, indirect, direct or * consequential damages or loss of profits, interruption of business, or * related expenses which may arise from use of Software or Documentation, * including but not limited to those resulting from defects in Software and/or * Documentation, or loss or inaccuracy of data of any kind. */ /** * @file DistGalois.h * * Contains the declaration of DistMemSys, a way to explicitly initiate the * Galois runtime. 
*/ #ifndef GALOIS_DIST_GALOIS_H #define GALOIS_DIST_GALOIS_H #include "galois/runtime/SharedMem.h" #include "galois/runtime/DistStats.h" #include #include #include namespace galois { /** * Explicit class to initialize the Galois Runtime. * The runtime is destroyed when this object is destroyed */ class DistMemSys : public runtime::SharedMem { public: explicit DistMemSys(); ~DistMemSys(); DistMemSys(const DistMemSys&) = delete; DistMemSys& operator=(const DistMemSys&) = delete; DistMemSys(DistMemSys&&) = delete; DistMemSys& operator=(DistMemSys&&) = delete; }; } // namespace galois #endif ================================================ FILE: libdist/include/galois/runtime/BareMPI.h ================================================ /* * This file belongs to the Galois project, a C++ library for exploiting * parallelism. The code is being released under the terms of the 3-Clause BSD * License (a copy is located in LICENSE.txt at the top-level directory). * * Copyright (C) 2018, The University of Texas at Austin. All rights reserved. * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY, * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF * DEALING OR USAGE OF TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances * shall University be liable for incidental, special, indirect, direct or * consequential damages or loss of profits, interruption of business, or * related expenses which may arise from use of Software or Documentation, * including but not limited to those resulting from defects in Software and/or * Documentation, or loss or inaccuracy of data of any kind. */ /* */ /** * @file BareMPI.h * * Contains the BareMPI enum and the command line option that controls bare * MPI usage. 
*/ #pragma once #ifdef GALOIS_USE_BARE_MPI #include "mpi.h" //! Defines types of bare MPI to use enum BareMPI { noBareMPI, //!< do not use bare MPI; use our network layer nonBlockingBareMPI, //!< non blocking bare MPI oneSidedBareMPI //!< one sided bare MPI }; #endif ================================================ FILE: libdist/include/galois/runtime/DistStats.h ================================================ /* * This file belongs to the Galois project, a C++ library for exploiting * parallelism. The code is being released under the terms of the 3-Clause BSD * License (a copy is located in LICENSE.txt at the top-level directory). * * Copyright (C) 2018, The University of Texas at Austin. All rights reserved. * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY, * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF * DEALING OR USAGE OF TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances * shall University be liable for incidental, special, indirect, direct or * consequential damages or loss of profits, interruption of business, or * related expenses which may arise from use of Software or Documentation, * including but not limited to those resulting from defects in Software and/or * Documentation, or loss or inaccuracy of data of any kind. */ /** * @file DistStats.h * * Contains declaration of DistStatManager, which reports runtime statistics of * a distributed application in Galois. */ #ifndef GALOIS_RUNTIME_DIST_STATS_H #define GALOIS_RUNTIME_DIST_STATS_H //! Turn on if you want more distributed stats to be printed #ifndef MORE_DIST_STATS #define MORE_DIST_STATS 0 #endif //! 
Turn on if you want more communication statistics to be printed #ifndef GALOIS_COMM_STATS #define GALOIS_COMM_STATS 0 #endif //! Turn on if you want per-bulk-synchronous parallel timers to be printed //! (otherwise all rounds are under 1 timer) #ifndef GALOIS_PER_ROUND_STATS #define GALOIS_PER_ROUND_STATS 0 #endif #include "galois/runtime/Statistics.h" #include "galois/runtime/Network.h" #include namespace galois { namespace runtime { /** * Helper class for the DistStatManager that aids in receiving statistics */ class StatRecvHelper; /** * Class responsible for tracking all statistics of a running distributed * Galois program and reporting them at the end of program execution. */ class DistStatManager : public galois::runtime::StatManager { //! Friend class that helps with receiving stats friend class galois::runtime::StatRecvHelper; using Base = galois::runtime::StatManager; using Str = galois::gstl::Str; using Base::SEP; static constexpr const char* const HSTAT_SEP = Base::TSTAT_SEP; static constexpr const char* const HSTAT_NAME = "HostValues"; static constexpr const char* const HSTAT_ENV_VAR = "PRINT_PER_HOST_STATS"; static bool printingHostVals(void); template struct HostTotalTypesImpl { struct DummyStat { StatTotal::Type m_totalTy; explicit DummyStat(StatTotal::Type total) : m_totalTy(total) {} template void add(const _U&) const {} const StatTotal::Type& totalTy(void) const { return m_totalTy; } }; using TMap = internal::BasicStatMap; bool merged = false; substrate::PerThreadStorage perThrdMap; void addToStat(const Str& region, const Str& category, const StatTotal::Type& hTotalTy) { perThrdMap.getLocal()->addToStat(region, category, 0, hTotalTy); } void mergeStats(void) { if (merged) { return; } GALOIS_ASSERT(perThrdMap.getLocal() == perThrdMap.getRemote(0), "Must call from Thread 0"); auto* t0Map = perThrdMap.getRemote(0); for (unsigned t = 1; t < perThrdMap.size(); ++t) { const auto* manager = perThrdMap.getRemote(t); for (auto i = manager->cbegin(), end_i 
= manager->cend(); i != end_i; ++i) { t0Map->addToStat(manager->region(i), manager->category(i), 0, manager->stat(i).totalTy()); } } merged = true; } const TMap& mergedMap(void) const { assert(merged && "Must merge first"); return *perThrdMap.getRemote(0); } }; using HostTotalTypes = HostTotalTypesImpl<>; template using ThrdVals = galois::gstl::Vector; template using HostStatVal = std::tuple&>; template struct HostStat : public internal::VecStat { using Base = internal::VecStat; using ThrdStats = internal::VecStat; using PerHostThrdStats = galois::gstl::Map; PerHostThrdStats perHostThrdStats; explicit HostStat(const StatTotal::Type& hTotalTy) : Base(hTotalTy) {} void add(const HostStatVal& val) { const auto& hostID = std::get<0>(val); const auto& thrdTotal = std::get<1>(val); const auto& thrdTotalTy = std::get<2>(val); const auto& thrdVals = std::get<3>(val); Base::add(thrdTotal); auto p = perHostThrdStats.emplace(hostID, ThrdStats(thrdTotalTy)); auto& tstat = p.first->second; for (const auto& i : thrdVals) { tstat.add(i); } } void printHostVals(std::ostream& out, const Str& region, const Str& category) const { out << StatManager::statKind() << SEP << galois::runtime::getHostID() << SEP; out << region << SEP << category << SEP; out << HSTAT_NAME << SEP; const char* sep = ""; for (const auto& v : Base::values()) { out << sep << v; sep = HSTAT_SEP; } out << std::endl; } void printThreadVals(std::ostream& out, const Str& region, const Str& category) const { for (const auto& p : perHostThrdStats) { out << StatManager::statKind() << SEP << p.first << SEP; out << region << SEP << category << SEP; out << StatTotal::str(p.second.totalTy()) << SEP << p.second.total(); out << std::endl; out << StatManager::statKind() << SEP << p.first << SEP; out << region << SEP << category << SEP; out << StatManager::TSTAT_NAME << SEP; const char* sep = ""; for (const auto& v : p.second.values()) { out << sep << v; sep = StatManager::TSTAT_SEP; } out << std::endl; } } }; template struct 
DistStatCombiner : public internal::BasicStatMap> { using Base = internal::BasicStatMap>; #if __GNUC__ < 5 static const char* htotalName(const StatTotal::Type& type){ #else static constexpr const char* htotalName(const StatTotal::Type& type) { #endif switch (type) { case StatTotal::SINGLE : return "HOST_0"; case StatTotal::TSUM: return "HSUM"; case StatTotal::TAVG: return "HAVG"; case StatTotal::TMIN: return "HMIN"; case StatTotal::TMAX: return "HMAX"; default: std::abort(); return nullptr; } } void print(std::ostream& out) const { for (auto i = Base::cbegin(), end_i = Base::cend(); i != end_i; ++i) { out << StatManager::statKind() << SEP << galois::runtime::getHostID() << SEP; out << Base::region(i) << SEP << Base::category(i) << SEP; const HostStat& hs = Base::stat(i); out << htotalName(hs.totalTy()) << SEP << hs.total(); out << std::endl; if (DistStatManager::printingHostVals()) { hs.printHostVals(out, Base::region(i), Base::category(i)); } if (StatManager::printingThreadVals()) { hs.printThreadVals(out, Base::region(i), Base::category(i)); } } } }; // namespace runtime DistStatCombiner intDistStats; DistStatCombiner fpDistStats; DistStatCombiner strDistStats; HostTotalTypes hostTotalTypes; protected: /** * Merge all stats from each individual thread as well as each individual * host as prescribed the the reduction (Total) type specified for each * statistic. */ void mergeStats(void); /** * Print the header of the stats file output. * * @param out File to print header out to */ void printHeader(std::ostream& out) const; /** * Merge all stats. Host 0 will then print out all collected stats. */ virtual void printStats(std::ostream& out); public: //! Dist stat manager constructor DistStatManager(const std::string& outfile = ""); ~DistStatManager(); /** * Adds a statistic to the statistics manager. 
* * @param region Region name to give statistic * @param category Category of statistic * @param val Value of the statistic * @param thrdTotalTy The type of reduction used to combine thread statistics * of the same kind * @param hTotalTy The type of reduction used to combine host statistics * of the same kind */ template void addToStat(const Str& region, const Str& category, const T& val, const StatTotal::Type& thrdTotalTy, const StatTotal::Type& hTotalTy) { Base::addToStat(region, category, val, thrdTotalTy); hostTotalTypes.addToStat(region, category, hTotalTy); } private: void combineAtHost_0_helper(void); void combineAtHost_0_helper2(void); void receiveAtHost_0_helper(void); void receiveAtHost_0_helper2(void); void combineAtHost_0(void); StatTotal::Type findHostTotalTy(const Str& region, const Str& category, const StatTotal::Type& thrdTotalTy) const; void addRecvdHostTotalTy(const Str& region, const Str& category, const StatTotal::Type& totalTy); void addRecvdStat(unsigned hostID, const Str& region, const Str& category, int64_t thrdTotal, const StatTotal::Type& thrdTotalTy, const ThrdVals& thrdVals); void addRecvdStat(unsigned hostID, const Str& region, const Str& category, double thrdTotal, const StatTotal::Type& thrdTotalTy, const ThrdVals& thrdVals); void addRecvdParam(unsigned hostID, const Str& region, const Str& category, const Str& thrdTotal, const StatTotal::Type& thrdTotalTy, const ThrdVals& thrdVals); }; // namespace galois namespace internal { /** * Gets a pointer to the distributed stat manager. * * @returns Pointer to distributed statistics manager */ DistStatManager* distSysStatManager(void); } // namespace internal /** * Adds a statistic to the statistics manager. Calls addToStat in * DistStatManager. 
* * @param region Region name to give statistic * @param category Category of statistic * @param value Value of the statistic * @param thrdTotalTy The type of reduction used to combine thread statistics * of the same kind * @param hTotalTy The type of reduction used to combine host statistics * of the same kind */ template inline void reportDistStat(const S1& region, const S2& category, const T& value, const StatTotal::Type& thrdTotalTy, const StatTotal::Type& hTotalTy) { internal::distSysStatManager()->addToStat(gstl::makeStr(region), gstl::makeStr(category), value, thrdTotalTy, hTotalTy); } } // end namespace runtime } // end namespace galois #endif // GALOIS_RUNTIME_DIST_STATS_H ================================================ FILE: libdist/include/galois/runtime/LWCI.h ================================================ /* * This file belongs to the Galois project, a C++ library for exploiting * parallelism. The code is being released under the terms of the 3-Clause BSD * License (a copy is located in LICENSE.txt at the top-level directory). * * Copyright (C) 2018, The University of Texas at Austin. All rights reserved. * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY, * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF * DEALING OR USAGE OF TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances * shall University be liable for incidental, special, indirect, direct or * consequential damages or loss of profits, interruption of business, or * related expenses which may arise from use of Software or Documentation, * including but not limited to those resulting from defects in Software and/or * Documentation, or loss or inaccuracy of data of any kind. 
*/ /** * @file LWCI.h * * LWCI header that includes lc.h (LCI library) and internal helper functions * on arrays. */ #pragma once #ifdef GALOIS_USE_LCI GALOIS_IGNORE_UNUSED_PARAMETERS #include "lc.h" GALOIS_END_IGNORE_UNUSED_PARAMETERS extern lc_ep lc_col_ep; extern lc_ep lc_p2p_ep[3]; namespace galois { namespace runtime { namespace internal { /** * Element-wise sum of 2 arrays. * * @tparam Ty type of elements contained in the arrays * * @param dst destination array to write to * @param src source array to read from * @param count Size of array in bytes */ template void ompi_op_sum(void* dst, void* src, size_t count) { Ty* dst_ty = (Ty*)dst; Ty* src_ty = (Ty*)src; for (size_t i = 0; i < (count / sizeof(Ty)); ++i) { dst_ty[i] += src_ty[i]; } } /** * Element-wise max of 2 arrays. * * @tparam Ty type of elements contained in the arrays * * @param dst destination array to write to * @param src source array to read from * @param count Size of array in bytes */ template void ompi_op_max(void* dst, void* src, size_t count) { Ty* dst_ty = (Ty*)dst; Ty* src_ty = (Ty*)src; for (size_t i = 0; i < (count / sizeof(Ty)); ++i) { if (dst_ty[i] < src_ty[i]) { dst_ty[i] = src_ty[i]; } } } /** * Element-wise min of 2 arrays. * * @tparam Ty type of elements contained in the arrays * * @param dst destination array to write to * @param src source array to read from * @param count Size of array in bytes */ template void ompi_op_min(void* dst, void* src, size_t count) { Ty* dst_ty = (Ty*)dst; Ty* src_ty = (Ty*)src; for (size_t i = 0; i < (count / sizeof(Ty)); ++i) { if (dst_ty[i] > src_ty[i]) { dst_ty[i] = src_ty[i]; } } } } // namespace internal } // namespace runtime } // namespace galois #endif ================================================ FILE: libdist/include/galois/runtime/MemUsage.h ================================================ /* * This file belongs to the Galois project, a C++ library for exploiting * parallelism. 
The code is being released under the terms of the 3-Clause BSD * License (a copy is located in LICENSE.txt at the top-level directory). * * Copyright (C) 2018, The University of Texas at Austin. All rights reserved. * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY, * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF * DEALING OR USAGE OF TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances * shall University be liable for incidental, special, indirect, direct or * consequential damages or loss of profits, interruption of business, or * related expenses which may arise from use of Software or Documentation, * including but not limited to those resulting from defects in Software and/or * Documentation, or loss or inaccuracy of data of any kind. */ /* */ /** * @file MemUsage.h * * Contains MemUsageTracker, a class that tracks memory usage throughout * runtime of a program of send/receive buffers. */ #pragma once #include namespace galois { namespace runtime { /** * Class that tracks memory usage (mainly of send and receive buffers). */ class MemUsageTracker { std::atomic currentMemUsage; //!< mem usage of send and receive buffers int64_t maxMemUsage; //!< max mem usage of send and receive buffers public: //! Default constructor initializes everything to 0. MemUsageTracker() : currentMemUsage(0), maxMemUsage(0) {} /** * Increment memory usage. * * @param size amount to increment mem usage by */ inline void incrementMemUsage(uint64_t size) { currentMemUsage += size; if (currentMemUsage > maxMemUsage) maxMemUsage = currentMemUsage; } /** * Decrement memory usage. 
* * @param size amount to decrement mem usage by */ inline void decrementMemUsage(uint64_t size) { currentMemUsage -= size; } /** * Reset mem usage and max mem usage to 0. */ inline void resetMemUsage() { currentMemUsage = 0; maxMemUsage = 0; } /** * Get max mem usage. * * @returns maximum memory usage tracked so far */ inline int64_t getMaxMemUsage() const { return maxMemUsage; } }; } // namespace runtime } // namespace galois ================================================ FILE: libdist/include/galois/runtime/Network.h ================================================ /* * This file belongs to the Galois project, a C++ library for exploiting * parallelism. The code is being released under the terms of the 3-Clause BSD * License (a copy is located in LICENSE.txt at the top-level directory). * * Copyright (C) 2018, The University of Texas at Austin. All rights reserved. * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY, * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF * DEALING OR USAGE OF TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances * shall University be liable for incidental, special, indirect, direct or * consequential damages or loss of profits, interruption of business, or * related expenses which may arise from use of Software or Documentation, * including but not limited to those resulting from defects in Software and/or * Documentation, or loss or inaccuracy of data of any kind. */ /** * @file Network.h * * Contains the network interface class which is the base class for all * network layer implementations. 
*/ #ifndef GALOIS_RUNTIME_NETWORK_H #define GALOIS_RUNTIME_NETWORK_H #include "galois/runtime/Serialize.h" #include "galois/runtime/MemUsage.h" #include "galois/substrate/Barrier.h" #include #include #include #include namespace galois::runtime { //! typedef for buffer that stores data to be sent out using SendBuffer = SerializeBuffer; //! typedef for buffer that received data is saved into using RecvBuffer = DeSerializeBuffer; /** * A class that defines functions that a network interface in Galois should * have. How the sends/recvs/stat-collecting happens as well * as the network layer itself is up to the implemention of the class. */ class NetworkInterface { protected: //! Initialize the MPI system. Should only be called once per process. void initializeMPI(); //! Finalize the MPI system. Should only be called once per process. void finalizeMPI(); //! Memory usage tracker MemUsageTracker memUsageTracker; //! Number of inflight sends and receives std::atomic inflightSends; std::atomic inflightRecvs; #ifdef GALOIS_USE_BARE_MPI public: //! Wrapper that calls into increment mem usage on the memory usage tracker inline void incrementMemUsage(uint64_t size) { memUsageTracker.incrementMemUsage(size); } //! Wrapper that calls into decrement mem usage on the memory usage tracker inline void decrementMemUsage(uint64_t size) { memUsageTracker.decrementMemUsage(size); } #endif public: //! This machine's host ID static uint32_t ID; //! The total number of machines in the current program static uint32_t Num; /** * Constructor for interface. */ NetworkInterface(); /** * Destructor destroys MPI (if it exists). */ virtual ~NetworkInterface(); //! Send a message to a given (dest) host. A message is simply a //! landing pad (recv, funciton pointer) and some data (buf) //! on the receiver, recv(buf) will be called durring handleReceives() //! buf is invalidated by this operation void sendMsg(uint32_t dest, void (*recv)(uint32_t, RecvBuffer&), SendBuffer& buf); //! 
Send a message letting the network handle the serialization and //! deserialization slightly slower template void sendSimple(uint32_t dest, void (*recv)(uint32_t, Args...), Args... param); //! Send a message to a given (dest) host. A message is simply a //! tag (tag) and some data (buf) //! on the receiver, buf will be returned on a receiveTagged(tag) //! buf is invalidated by this operation virtual void sendTagged(uint32_t dest, uint32_t tag, SendBuffer& buf, int type = 0) = 0; //! Send a message to all hosts. A message is simply a //! landing pad (recv) and some data (buf) //! buf is invalidated by this operation void broadcast(void (*recv)(uint32_t, RecvBuffer&), SendBuffer& buf, bool self = false); //! Broadcast a message allowing the network to handle serialization and //! deserialization template void broadcastSimple(void (*recv)(uint32_t, Args...), Args... param); //! Receive and dispatch messages void handleReceives(); //! Wrapper to reset the mem usage tracker's stats inline void resetMemUsage() { memUsageTracker.resetMemUsage(); } //! Reports the memory usage tracker's statistics to the stat manager void reportMemUsage() const; //! Receive a tagged message virtual std::optional> recieveTagged(uint32_t tag, std::unique_lock* rlg, int type = 0) = 0; //! move send buffers out to network virtual void flush() = 0; //! @returns true if any send is in progress or is pending to be enqueued virtual bool anyPendingSends() = 0; //! @returns true if any receive is in progress or is pending to be dequeued virtual bool anyPendingReceives() = 0; //! Get how many bytes were sent //! @returns num bytes sent virtual unsigned long reportSendBytes() const = 0; //! Get how many messages were sent //! @returns num messages sent virtual unsigned long reportSendMsgs() const = 0; //! Get how many bytes were received //! @returns num bytes received virtual unsigned long reportRecvBytes() const = 0; //! Get how many messages were received //! 
@returns num messages received virtual unsigned long reportRecvMsgs() const = 0; //! Get any other extra statistics that might need to be reported; varies //! depending on implementation //! @returns vector of extra things to be reported virtual std::vector reportExtra() const = 0; //! Get the names of the extra things that are returned by reportExtra //! @returns vector of the names of the reported extra things virtual std::vector> reportExtraNamed() const = 0; }; //! Variable that keeps track of which network send/recv phase a program is //! currently on. Can be seen as a count of send/recv rounds that have occured. extern uint32_t evilPhase; //! Get the network interface //! @returns network interface NetworkInterface& getSystemNetworkInterface(); namespace internal { //! Deletes the system network interface (if it exists). void destroySystemNetworkInterface(); } // namespace internal //! Gets this host's ID //! @returns ID of this host uint32_t getHostID(); //! Returns a BufferedNetwork interface NetworkInterface& makeNetworkBuffered(); //! Returns a LCINetwork interface NetworkInterface& makeNetworkLCI(); //! Returns a host barrier, which is a regular MPI-Like Barrier for all hosts. //! @warning Should not be called within a parallel region; assumes only one //! thread is calling it substrate::Barrier& getHostBarrier(); //! Returns a fence that ensures all pending messages are delivered, acting //! like a memory-barrier substrate::Barrier& getHostFence(); //////////////////////////////////////////////////////////////////////////////// // Implementations //////////////////////////////////////////////////////////////////////////////// namespace { // anon template static void genericLandingPad(uint32_t src, RecvBuffer& buf) { void (*fp)(uint32_t, Args...); std::tuple args; gDeserialize(buf, fp, args); std::apply([fp, src](Args... 
params) { fp(src, params...); }, args); } } // namespace template void NetworkInterface::sendSimple(uint32_t dest, void (*recv)(uint32_t, Args...), Args... param) { SendBuffer buf; gSerialize(buf, (uintptr_t)recv, param..., (uintptr_t)genericLandingPad); sendTagged(dest, 0, buf); } template void NetworkInterface::broadcastSimple(void (*recv)(uint32_t, Args...), Args... param) { SendBuffer buf; gSerialize(buf, (uintptr_t)recv, param...); broadcast(genericLandingPad, buf, false); } } // namespace galois::runtime #endif ================================================ FILE: libdist/include/galois/runtime/NetworkIO.h ================================================ /* * This file belongs to the Galois project, a C++ library for exploiting * parallelism. The code is being released under the terms of the 3-Clause BSD * License (a copy is located in LICENSE.txt at the top-level directory). * * Copyright (C) 2018, The University of Texas at Austin. All rights reserved. * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY, * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF * DEALING OR USAGE OF TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances * shall University be liable for incidental, special, indirect, direct or * consequential damages or loss of profits, interruption of business, or * related expenses which may arise from use of Software or Documentation, * including but not limited to those resulting from defects in Software and/or * Documentation, or loss or inaccuracy of data of any kind. */ /** * @file NetworkIO.h * * Contains NetworkIO, a base class that is inherited by classes that want to * implement the communication layer of Galois. (e.g. 
NetworkIOMPI and * NetworkIOLWCI) */ #ifndef GALOIS_RUNTIME_NETWORKTHREAD_H #define GALOIS_RUNTIME_NETWORKTHREAD_H #include #include #include #include #include #include #include #include #include #include #include #include "galois/runtime/MemUsage.h" #include "galois/PODResizeableArray.h" namespace galois { namespace runtime { /** * Class for the network IO layer which is responsible for doing sends/receives * of data. Used by the network interface to do the actual communication. */ class NetworkIO { protected: /** * Wrapper for dealing with MPI error codes. Program dies if the error code * isn't MPI_SUCCESS. * * @param rc Error code to check for success */ static void handleError(int rc) { if (rc != MPI_SUCCESS) { MPI_Abort(MPI_COMM_WORLD, rc); } } //! memory usage tracker MemUsageTracker& memUsageTracker; //! Number of inflight sends and receives std::atomic& inflightSends; std::atomic& inflightRecvs; // using vTy = std::vector; using vTy = galois::PODResizeableArray; public: /** * Message structure for sending data across the network. */ struct message { uint32_t host; //!< destination of this message uint32_t tag; //!< tag on message indicating distinct communication phases vTy data; //!< data portion of message //! Default constructor initializes host and tag to large numbers. message() : host(~0), tag(~0) {} //! @param h Host to send message to //! @param t Tag to associate with message //! @param d Data to save in message message(uint32_t h, uint32_t t, vTy&& d) : host(h), tag(t), data(std::move(d)) {} //! A message is valid if there is data to be sent //! @returns true if data is non-empty bool valid() const { return !data.empty(); } }; //! The default constructor takes a memory usage tracker and saves it //! @param tracker reference to a memory usage tracker used by the system //! @param sends //! 
@param recvs NetworkIO(MemUsageTracker& tracker, std::atomic& sends, std::atomic& recvs) : memUsageTracker(tracker), inflightSends(sends), inflightRecvs(recvs) {} //! Default destructor does nothing. virtual ~NetworkIO(); //! Queues a message for sending out. Takes ownership of data buffer. virtual void enqueue(message m) = 0; //! Checks to see if a message is here for this host to receive. If so, take //! and return it //! @returns an empty message if no message virtual message dequeue() = 0; //! Make progress. Other functions don't have to make progress. virtual void progress() = 0; }; /** * Creates/returns a network IO layer that uses MPI to do communication. * * @returns tuple with pointer to the MPI IO layer, this host's ID, and the * total number of hosts in the system */ std::tuple, uint32_t, uint32_t> makeNetworkIOMPI(galois::runtime::MemUsageTracker& tracker, std::atomic& sends, std::atomic& recvs); // #ifdef GALOIS_USE_LCI // /** // * Creates/returns a network IO layer that uses LWCI to do communication. // * // * @returns tuple with pointer to the LWCI IO layer, this host's ID, and the // * total number of hosts in the system // */ // std::tuple, uint32_t, uint32_t> // makeNetworkIOLWCI(galois::runtime::MemUsageTracker& tracker, // std::atomic& sends, std::atomic& recvs); // #endif } // namespace runtime } // namespace galois #endif ================================================ FILE: libdist/include/galois/runtime/Serialize.h ================================================ /* * This file belongs to the Galois project, a C++ library for exploiting * parallelism. The code is being released under the terms of the 3-Clause BSD * License (a copy is located in LICENSE.txt at the top-level directory). * * Copyright (C) 2018, The University of Texas at Austin. All rights reserved. 
* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY, * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF * DEALING OR USAGE OF TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances * shall University be liable for incidental, special, indirect, direct or * consequential damages or loss of profits, interruption of business, or * related expenses which may arise from use of Software or Documentation, * including but not limited to those resulting from defects in Software and/or * Documentation, or loss or inaccuracy of data of any kind. */ /** * @file Serialize.h * * Contains functions that serialize/deserialize data, mainly for sending * out serialized data over the network and deserializing it on the other end. */ #ifndef GALOIS_RUNTIME_SERIALIZE_H #define GALOIS_RUNTIME_SERIALIZE_H #include #include #include #include #include #include #include #include #include "galois/runtime/ExtraTraits.h" #include #include #include #include #include "galois/CopyableTuple.h" #include "galois/Bag.h" namespace galois { namespace runtime { class DeSerializeBuffer; // forward declaration for friend declaration /** * Buffer for serialization of data. Mainly used during network communication. */ class SerializeBuffer { //! Access to a deserialize buffer friend DeSerializeBuffer; //! type of data buffer // using vTy = std::vector; using vTy = galois::PODResizeableArray; //! the actual data stored in this buffer vTy bufdata; public: //! default constructor SerializeBuffer() = default; //! disabled copy constructor SerializeBuffer(SerializeBuffer&& rhs) = default; //! Creates a buffer from another buffer //! @param d buffer to create from //! 
@param len amount of copy from buffer d SerializeBuffer(const char* d, unsigned len) : bufdata(d, d + len) {} //! Push a character onto the serialize buffer inline void push(const char c) { bufdata.push_back(c); } //! Insert characters from a buffer into the serialize buffer void insert(const uint8_t* c, size_t bytes) { bufdata.insert(bufdata.end(), c, c + bytes); } //! Insert characters from a buffer into the serialize buffer at a particular //! offset void insertAt(const uint8_t* c, size_t bytes, size_t offset) { std::copy_n(c, bytes, bufdata.begin() + offset); } /** * Reserve space at the end for inserting new data into the serialize * buffer * * @param bytes number of bytes to reserve at the end * @returns offset to the end of the buffer before new space was reserved */ size_t encomber(size_t bytes) { size_t retval = bufdata.size(); bufdata.resize(retval + bytes); return retval; } void resize(size_t bytes) { bufdata.resize(bytes); } /** * Reserve more space in the serialize buffer. * * @param s extra space to reserve */ void reserve(size_t s) { bufdata.reserve(bufdata.size() + s); } //! Returns a pointer to the data stored in this serialize buffer const uint8_t* linearData() const { return bufdata.data(); } //! Returns vector of data stored in this serialize buffer vTy& getVec() { return bufdata; } //! Returns an iterator to the beginning of the data in this serialize buffer vTy::const_iterator begin() const { return bufdata.cbegin(); } //! Returns an iterator to the end of the data in this serialize buffer vTy::const_iterator end() const { return bufdata.cend(); } using size_type = vTy::size_type; //! Returns the size of the serialize buffer size_type size() const { return bufdata.size(); } //! Utility print function for the serialize buffer //! @param o stream to print to void print(std::ostream& o) const { o << "<{" << std::hex; for (auto& i : bufdata) o << (unsigned int)i << " "; o << std::dec << "}>"; } //! 
Operator that calls the print function of the serialize buffer friend std::ostream& operator<<(std::ostream& os, const SerializeBuffer& b) { b.print(os); return os; } }; /** * Buffer for deserialization of data. Mainly used during network * communication. */ class DeSerializeBuffer { //! Access to serialize buffer friend SerializeBuffer; //! type of data buffer // using vTy = std::vector; using vTy = galois::PODResizeableArray; //! the actual data stored in this buffer vTy bufdata; int offset; public: //! Constructor initializes offset into buffer to 0 DeSerializeBuffer() : offset(0) {} //! Disable copy constructor DeSerializeBuffer(DeSerializeBuffer&&) = default; //! Move constructor //! @param v vector to act as deserialize buffer //! @param start offset to start saving data into DeSerializeBuffer(vTy&& v, uint32_t start = 0) : bufdata(std::move(v)), offset(start) {} //! Constructor that takes an existing vector to use as the deserialize //! buffer explicit DeSerializeBuffer(vTy& data) { bufdata.swap(data); offset = 0; } /** * Initializes the deserialize buffer with a certain size * @param [in] count size to initialize buffer to */ explicit DeSerializeBuffer(int count) : bufdata(count), offset(0) {} /** * Initializes the deserialize buffer using vector initialization from * 2 iterators. */ template DeSerializeBuffer(Iter b, Iter e) : bufdata(b, e), offset{0} {} /** * Initialize a deserialize buffer from a serialize buffer */ explicit DeSerializeBuffer(SerializeBuffer&& buf) : offset(0) { bufdata.swap(buf.bufdata); } /** * Disable copy constructor */ DeSerializeBuffer& operator=(DeSerializeBuffer&& buf) = default; /** * Reset deserialize buffer * @param count new size of buffer */ void reset(int count) { offset = 0; bufdata.resize(count); } //! Gets the current offset into the deserialize buffer unsigned getOffset() const { return offset; } //! Sets the offset into the deserialize buffer void setOffset(unsigned off) { assert(off <= size()); offset = off; } //! 
Gets the size of the deserialize buffer unsigned size() const { return bufdata.size(); } //! Returns true if the deserialize buffer is empty //! @returns true if the deserialize buffer is empty bool empty() const { return bufdata.empty(); } //! Get the next character in the deserialize buffer unsigned char pop() { return bufdata.at(offset++); } //! Clears the last x bytes of the deserialize buffer, resizing it as well //! @param x How many bytes from the end to clear void pop_back(unsigned x) { bufdata.resize(bufdata.size() - x); } /** * Extracts a certain amount of data from the deserialize buffer * * @param dst buffer to copy data from deserialize buffer into * @param num Amount of data to get from deserialize buffer */ void extract(uint8_t* dst, size_t num) { if (num > 0) { memcpy(dst, &bufdata[offset], num); offset += num; } } //! Get the underlying vector storing the data of the deserialize //! buffer vTy& getVec() { return bufdata; } //! Get a pointer to the underlying data of the deserialize buffer void* linearData() { return &bufdata[0]; } //! Get a pointer to the remaining data of the deserialize buffer //! (as determined by offset) const uint8_t* r_linearData() const { return &bufdata[offset]; } //! Get the remaining size of the deserialize buffer (as determined //! by offset) size_t r_size() const { return bufdata.size() - offset; } //! Checks if the current location in the deserialize buffer is aligned //! to some size a bool atAlignment(size_t a) { return (uintptr_t)r_linearData() % a == 0; } //! Utility print of deserialize buffer //! @param o stream to print to void print(std::ostream& o) const { o << "<{(" << offset << ") " << std::hex; for (auto ii = bufdata.begin(), ee = bufdata.end(); ii != ee; ++ii) o << (unsigned int)*ii << " "; o << std::dec << "}>"; } //! 
Operator for printing deserialize buffer friend std::ostream& operator<<(std::ostream& os, const DeSerializeBuffer& buf) { buf.print(os); return os; } }; namespace internal { /** * Returns the size necessary for an object in a buffer. * This version runs if the data is memory copyable; uses sizeof. * * @tparam T type of datato get size of */ template __attribute__((always_inline)) constexpr size_t gSizedObj(const T&, typename std::enable_if::value>::type* = 0) { return sizeof(T); } /** * Returns the size necessary for an object in a buffer. * This version runs if the data is not memory copyable but is serializable. * It returns the size of a uintptr_t. * * @tparam T type of datato get size of * @returns size of uintptr_t */ template __attribute__((always_inline)) constexpr size_t gSizedObj(const T&, typename std::enable_if::value>::type* = 0, typename std::enable_if::value>::type* = 0) { return sizeof(uintptr_t); } /** * Returns the size necessary for storing 2 elements of a pair into a * serialize buffer. * * @param data pair of 2 elements */ template inline size_t gSizedObj(const std::pair& data) { return gSizedObj(data.first) + gSizedObj(data.second); } /** * Returns the size necessary to store a sequence in a serialize buffer. * This depends on if the sequence is memory copyable. */ template size_t gSizedSeq(const Seq& seq) { typename Seq::size_type size = seq.size(); typedef typename Seq::value_type T; size_t tsize = std::conditional< is_memory_copyable::value, std::integral_constant, std::integral_constant>::type::value; return sizeof(size) + tsize * size; } /** * Returns the size needed to store the elements a vector in a serialize * buffer. * * @returns size needed to store a vector into a serialize buffer */ template inline size_t gSizedObj(const std::vector& data) { return gSizedSeq(data); } /** * Returns the size needed to store the elements a PODResizeableArray in a * serialize buffer. 
* * @returns size needed to store a PODResizeableArray into a serialize buffer */ template inline size_t gSizedObj(const galois::PODResizeableArray& data) { return gSizedSeq(data); } /** * Returns the size needed to store the elements a deque into a serialize * buffer. * * @returns size needed to store a deque into a serialize buffer */ template inline size_t gSerializeObj(const std::deque& data) { return gSizedSeq(data); } /** * Returns the size needed to store the elements a Galois deque into a serialize * buffer. * * @returns size needed to store a Galois deque into a serialize buffer */ template inline size_t gSizedObj(const galois::gdeque& data) { return gSizedSeq(data); } /** * Returns the size needed to store a string into a serialize * buffer. * * @returns size needed to store a string into a serialize buffer */ template inline size_t gSizedObj(const std::basic_string, A>& data) { return data.length() + 1; } /** * Returns the size of the passed in serialize buffer * * @returns size of the serialize buffer passed into it */ inline size_t gSizedObj(const SerializeBuffer& data) { return data.size(); } /** * Returns the size of the passed in deserialize buffer * * @returns size of the deserialize buffer passed into it */ inline size_t gSizedObj(const DeSerializeBuffer& rbuf) { return rbuf.r_size(); } /** * Returns the size of the passed in insert bag. * * @returns size of the insert bag passed into it */ template inline size_t gSizedObj(const galois::InsertBag& bag) { return bag.size(); } /** * Returns 0. * @returns 0 */ inline size_t adder() { return 0; } /** * Returns the passed in argument. * @param a a number * @returns a */ inline size_t adder(size_t a) { return a; } /** * Returns the sum of all passed in arguments. * @returns sum of all arguments */ template inline size_t adder(size_t a, size_t b, Args&&... 
args) { return a + b + adder(args...); } } // namespace internal /** * Gets the total size necessary for storing all of the passed in arguments into * a serialize buffer. * * @returns size necessary for storing all arguments into a serialize buffer */ template static inline size_t gSized(Args&&... args) { return internal::adder(internal::gSizedObj(args)...); } //////////////////////////////////////////////////////////////////////////////// // Serialize support //////////////////////////////////////////////////////////////////////////////// namespace internal { /** * Serialize a memory copyable object into a serialize buffer. * * @param [in,out] buf Serialize buffer to serialize into * @param [in] data Data to serialize */ template inline void gSerializeObj( SerializeBuffer& buf, const T& data, typename std::enable_if::value>::type* = 0) { uint8_t* pdata = (uint8_t*)&data; buf.insert(pdata, sizeof(T)); } /** * Serialize a non-memory copyable but serializable object into a serialize * buffer. * * @param [in,out] buf Serialize buffer to serialize into * @param [in] data Data to serialize */ template inline void gSerializeObj(SerializeBuffer& buf, const T& data, typename std::enable_if::value>::type* = 0, typename std::enable_if::value>::type* = 0) { data.serialize(buf); } /** * Serialize a pair into a serialize buffer. * * @param [in,out] buf Serialize buffer to serialize into * @param [in] data Pair to serialize */ template inline void gSerializeObj(SerializeBuffer& buf, const std::pair& data) { gSerialize(buf, data.first, data.second); } /** * Serialize a pair. Either memcpys entire struct or serializes * each element individually. 
* * @param [in,out] buf Serialize buffer to serialize into * @param [in] data Pair to serialize */ template inline void gSerializeObj(SerializeBuffer& buf, const galois::Pair& data) { if (is_memory_copyable::value && is_memory_copyable::value) { // do memcpy buf.insert((uint8_t*)&data, sizeof(data)); } else { // serialize each individually gSerialize(buf, data.first, data.second); } } /** * Serialize a tuple of 3. Either memcpys entire struct or serializes * each element individually. * * @param [in,out] buf Serialize buffer to serialize into * @param [in] data Tuple of 3 to serialize * @todo This specialization isn't being used as expected. Figure out why. */ template inline void gSerializeObj(SerializeBuffer& buf, const galois::TupleOfThree& data) { if (is_memory_copyable::value && is_memory_copyable::value && is_memory_copyable::value) { // do memcpy buf.insert((uint8_t*)&data, sizeof(data)); } else { // serialize each individually gSerialize(buf, data.first, data.second, data.third); } } /** * Serialize a copyable atomic: load atomic data as a plain old * datatype (POD) and mem copy it to the buffer. * * @param [in,out] buf Serialize buffer to serialize into * @param [in] data copyable atomic to serialize */ template inline void gSerializeObj(SerializeBuffer& buf, const galois::CopyableAtomic& data) { T temp = data.load(); buf.insert((uint8_t*)(&temp), sizeof(T)); } /** * Serialize a string into a buffer. * * @param [in,out] buf Serialize buffer to serialize into * @param [in] data String */ template inline void gSerializeObj(SerializeBuffer& buf, const std::basic_string, A>& data) { buf.insert((uint8_t*)data.data(), data.length() + 1); } // Forward declaration of vector serialize template inline void gSerializeObj(SerializeBuffer& buf, const std::vector& data); /** * Serialize a sequence type into a buffer. 
* * @param [in,out] buf Serialize buffer to serialize into * @param [in] seq sequence to serialize * @todo specialize for Sequences with consecutive PODS */ template void gSerializeSeq(SerializeBuffer& buf, const Seq& seq) { typename Seq::size_type size = seq.size(); gSerializeObj(buf, size); for (auto& o : seq) gSerializeObj(buf, o); } /** * Serialize a linear sequence type (i.e. memcopyable) into a buffer. * * @param [in,out] buf Serialize buffer to serialize into * @param [in] seq sequence to serialize */ template void gSerializeLinearSeq(SerializeBuffer& buf, const Seq& seq) { typename Seq::size_type size = seq.size(); typedef typename Seq::value_type T; size_t tsize = sizeof(T); // buf.reserve(size * tsize + sizeof(size)); gSerializeObj(buf, size); buf.insert((uint8_t*)seq.data(), size * tsize); } /** * Serialize a vector into a buffer, choosing to do a memcopy or * to serialize each element individually depending on data. * * @param [in,out] buf Serialize buffer to serialize into * @param [in] data vector to serialize */ template inline void gSerializeObj(SerializeBuffer& buf, const std::vector& data) { if (is_memory_copyable::value) gSerializeLinearSeq(buf, data); else gSerializeSeq(buf, data); } /** * Serialize a PODResizeableArray into a buffer, choosing to do a memcopy or * to serialize each element individually depending on data. * * @param [in,out] buf Serialize buffer to serialize into * @param [in] data PODResizeableArray to serialize */ template inline void gSerializeObj(SerializeBuffer& buf, const galois::PODResizeableArray& data) { gSerializeLinearSeq(buf, data); } /** * Serialize a deque into a buffer. * * @param [in,out] buf Serialize buffer to serialize into * @param [in] data deque to serialize */ template inline void gSerializeObj(SerializeBuffer& buf, const std::deque& data) { gSerializeSeq(buf, data); } /** * Serialize a Galois deque into a buffer. 
* * @param [in,out] buf Serialize buffer to serialize into * @param [in] data deque to serialize */ template inline void gSerializeObj(SerializeBuffer& buf, const galois::gdeque& data) { gSerializeSeq(buf, data); } /** * Serialize data in another serialize buffer into a buffer. * * @param [in,out] buf Serialize buffer to serialize into * @param [in] data serialize buffer to get data from */ inline void gSerializeObj(SerializeBuffer& buf, const SerializeBuffer& data) { buf.insert(data.linearData(), data.size()); } /** * Serialize data in a deserialize buffer into a buffer. * * @param [in,out] buf Serialize buffer to serialize into * @param [in] rbuf deserialize buffer to get data from */ inline void gSerializeObj(SerializeBuffer& buf, const DeSerializeBuffer& rbuf) { // buf.reserve(rbuf.r_size()); buf.insert(rbuf.r_linearData(), rbuf.r_size()); } /** * Serialize a dynamic bitset into a buffer. * * @param [in,out] buf Serialize buffer to serialize into * @param [in] data dynamic bitset to serialize */ inline void gSerializeObj(SerializeBuffer& buf, const galois::DynamicBitSet& data) { gSerializeObj(buf, data.size()); gSerializeObj(buf, data.get_vec()); } // we removed the functions in Bag.h that this function requires, so this // won't work #if 0 /** * For serializing insertBag. * Insert contigous memory chunks for each thread * and clear it. * Can not be const. * Implemention below makes sure that it can be deserialized * into a linear sequence like vector or deque. 
*/ template inline void gSerializeObj(SerializeBuffer& buf, galois::InsertBag& bag){ gSerializeObj(buf, bag.size()); auto headerVec = bag.getHeads(); size_t totalSize = 0; for(auto h : headerVec){ size_t localSize = (h->dend - h->dbegin); buf.insert((uint8_t*)h->dbegin, localSize*sizeof(T)); totalSize += (h->dend - h->dbegin); } assert(totalSize == bag.size()); bag.clear(); } #endif } // namespace internal /** * LazyRef structure; used to store both a type and an offset to begin * saving data into */ template struct LazyRef { size_t off; }; /** * Lazy serialize: doesn't actually serialize the data itself, but only * reserves space for it in the serialize buffer + serializes the * passed in num. */ template static inline LazyRef gSerializeLazySeq(SerializeBuffer& buf, unsigned num, Seq*) { static_assert(is_memory_copyable::value, "Not POD Sequence"); typename Seq::size_type size = num; internal::gSerializeObj(buf, size); size_t tsize = sizeof(typename Seq::value_type); return LazyRef{buf.encomber(tsize * num)}; } /** * Lazy serialize: given an offset and type through a LazyRef object, * serializes a certain amount from the passed in data array. * * @param buf Buffer to serialize into * @param r struct with info on where to start saving data and the type * of the data that needs to be saved * @param item Number of items that need to be serialized * @param data Data array containing data that needs to be serialized */ template static inline void gSerializeLazy(SerializeBuffer& buf, LazyRef r, unsigned item, Ty&& data) { size_t off = r.off + sizeof(Ty) * item; uint8_t* pdata = (uint8_t*)&data; buf.insertAt(pdata, sizeof(Ty), off); } /** * Serialize an entire series of datatypes into a provided serialize buffer */ template static inline void gSerialize(SerializeBuffer& buf, T1&& t1, Args&&... args) { buf.reserve(gSized(t1, args...)); internal::gSerializeObj(buf, std::forward(t1)); gSerialize(buf, std::forward(args)...); } /** * No-op function. 
"Base case" for recursive gSerialize function. */ static inline void gSerialize(SerializeBuffer&) {} //////////////////////////////////////////////////////////////////////////////// // Deserialize support //////////////////////////////////////////////////////////////////////////////// namespace internal { /** * Deserialize a memcopyable object from a buffer. * * @param buf [in,out] Buffer to deserialize from * @param data [in,out] Data to deserialize into */ template void gDeserializeObj( DeSerializeBuffer& buf, T& data, typename std::enable_if::value>::type* = 0) { uint8_t* pdata = (uint8_t*)&data; buf.extract(pdata, sizeof(T)); } /** * Deserialize a non-memcopyable but seralizable object from a buffer. * * @param buf [in,out] Buffer to deserialize from * @param data [in,out] Data to deserialize into */ template void gDeserializeObj( DeSerializeBuffer& buf, T& data, typename std::enable_if::value>::type* = 0, typename std::enable_if::value>::type* = 0) { data.deserialize(buf); } /** * Deserialize a pair from a buffer. * * @param buf [in,out] Buffer to deserialize from * @param data [in,out] pair to deserialize into */ template void gDeserializeObj(DeSerializeBuffer& buf, std::pair& data) { gDeserialize(buf, data.first, data.second); } /** * Deserialize into a pair. Either memcpys from buffer or deserializes * each element individually. * * @param [in,out] buf Buffer to deserialize from * @param [in] data Pair to deserialize into */ template inline void gDeserializeObj(DeSerializeBuffer& buf, galois::Pair& data) { if (is_memory_copyable::value && is_memory_copyable::value) { // do memcpy buf.extract((uint8_t*)&data, sizeof(data)); } else { // deserialize each individually gDeserialize(buf, data.first, data.second); } } /** * Deserialize into a tuple of 3. Either memcpys from buffer or deserializes * each element individually. 
* * @param buf [in,out] Buffer to deserialize from * @param data [in,out] triple to deserialize into * @todo This specialization isn't being used as expected. Figure out why. */ template inline void gDeserializeObj(DeSerializeBuffer& buf, galois::TupleOfThree& data) { if (is_memory_copyable::value && is_memory_copyable::value && is_memory_copyable::value) { // do memcpy straight to data buf.extract((uint8_t*)&data, sizeof(data)); } else { // deserialize each individually gDeserialize(buf, data.first, data.second, data.third); } } /** * Deserialize into a CopyableAtomic. Loads the POD from the DeserializeBuffer * then stores it into the atomic. * * @param buf [in,out] Buffer to deserialize from * @param data [in,out] copyable atomic to deserialize into */ template void gDeserializeObj(DeSerializeBuffer& buf, galois::CopyableAtomic& data) { T tempData; uint8_t* pointerToTemp = (uint8_t*)&tempData; buf.extract(pointerToTemp, sizeof(T)); data.store(tempData); } namespace { template struct seq {}; template struct gens : gens {}; template struct gens<0, S...> { typedef seq type; }; } // namespace /** * Deserialize into a tuple. * * @param buf [in,out] Buffer to deserialize from * @param data [in,out] tuple to serialize into */ template void gDeserializeTuple(DeSerializeBuffer& buf, std::tuple& data, seq) { gDeserialize(buf, std::get(data)...); } /** * Wrapper for deserialization into a tuple. * * @param buf [in,out] Buffer to deserialize from * @param data [in,out] tuple to serialize into */ template void gDeserializeObj(DeSerializeBuffer& buf, std::tuple& data) { return gDeserializeTuple(buf, data, typename gens::type()); } /** * Deserialize into a string. 
* * @param buf [in,out] Buffer to deserialize from * @param data [in,out] string to serialize into */ template inline void gDeserializeObj(DeSerializeBuffer& buf, std::basic_string, A>& data) { char c = buf.pop(); while (c != '\0') { data.push_back(c); c = buf.pop(); }; } // Forward declaration of vector deserialize template void gDeserializeObj(DeSerializeBuffer& buf, std::vector& data); /** * Deserialize into a sequence object * * @param buf [in,out] Buffer to deserialize from * @param seq [in,out] sequence to deserialize into */ template void gDeserializeSeq(DeSerializeBuffer& buf, Seq& seq) { seq.clear(); typename Seq::size_type size; gDeserializeObj(buf, size); while (size--) { typename Seq::value_type v; gDeserializeObj(buf, v); seq.push_back(v); } } /** * Deserialize into a linear sequence object (i.e. one that is mem-copyable) * * @param buf [in,out] Buffer to deserialize from * @param seq [in,out] sequence to deserialize into */ template void gDeserializeLinearSeq(DeSerializeBuffer& buf, Seq& seq) { typedef typename Seq::value_type T; // seq.clear(); typename Seq::size_type size; gDeserializeObj(buf, size); // If the alignment is right, cast to a T array and insert if (buf.atAlignment(alignof(T))) { T* src = (T*)buf.r_linearData(); seq.assign(src, &src[size]); buf.setOffset(buf.getOffset() + size * sizeof(T)); } else { seq.resize(size); buf.extract((uint8_t*)seq.data(), size * sizeof(T)); } } /** * Deserialize into a deque * * @param buf [in,out] Buffer to deserialize from * @param data [in,out] deque to deserialize into */ template void gDeserializeObj(DeSerializeBuffer& buf, std::deque& data) { gDeserializeSeq(buf, data); } /** * Deserialize into a vector; implementation depends on whether or not data in * vector is mem-copyable * * @param buf [in,out] Buffer to deserialize from * @param data [in,out] vector to deserialize into */ template void gDeserializeObj(DeSerializeBuffer& buf, std::vector& data) { if (is_memory_copyable::value) 
gDeserializeLinearSeq(buf, data); else gDeserializeSeq(buf, data); } /** * Deserialize into a PODResizeableArray * * @param buf [in,out] Buffer to deserialize from * @param data [in,out] PODResizeableArray to deserialize into */ template void gDeserializeObj(DeSerializeBuffer& buf, galois::PODResizeableArray& data) { gDeserializeLinearSeq(buf, data); } /** * Deserialize into a galois deque * * @param buf [in,out] Buffer to deserialize from * @param data [in,out] galois deque to deserialize into */ template void gDeserializeObj(DeSerializeBuffer& buf, galois::gdeque& data) { gDeserializeSeq(buf, data); } /** * Deserialize into a dynamic bitset * * @param buf [in,out] Buffer to deserialize from * @param data [in,out] bitset to deserialize into */ inline void gDeserializeObj(DeSerializeBuffer& buf, galois::DynamicBitSet& data) { size_t size = 0; gDeserializeObj(buf, size); data.resize(size); gDeserializeObj(buf, data.get_vec()); } } // namespace internal /** * Deserialize data in a buffer into a series of objects */ template void gDeserialize(DeSerializeBuffer& buf, T1&& t1, Args&&... args) { internal::gDeserializeObj(buf, std::forward(t1)); gDeserialize(buf, std::forward(args)...); } /** * Base case for regular gDeserialize recursive call. */ inline void gDeserialize(DeSerializeBuffer&) {} /** * "Deserialize" data in an iterator type into a data object. 
* * @tparam Iter iterator type that has objects of type T * @tparam T type of data to deserialize into * @param iter Iterator containing data that we want to save into the passed in * data reference * @param data Object to save data in the iterator type into */ template auto gDeserializeRaw(Iter iter, T& data) -> decltype( std::declval::value>::type>(), Iter()) { unsigned char* pdata = (unsigned char*)&data; for (size_t i = 0; i < sizeof(T); ++i) pdata[i] = *iter++; return iter; } } // namespace runtime } // namespace galois #endif // SERIALIZE DEF end ================================================ FILE: libdist/src/Barrier.cpp ================================================ /* * This file belongs to the Galois project, a C++ library for exploiting * parallelism. The code is being released under the terms of the 3-Clause BSD * License (a copy is located in LICENSE.txt at the top-level directory). * * Copyright (C) 2018, The University of Texas at Austin. All rights reserved. * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY, * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF * DEALING OR USAGE OF TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances * shall University be liable for incidental, special, indirect, direct or * consequential damages or loss of profits, interruption of business, or * related expenses which may arise from use of Software or Documentation, * including but not limited to those resulting from defects in Software and/or * Documentation, or loss or inaccuracy of data of any kind. */ /** * @file libdist/src/Barrier.cpp * * Contains implementation of HostFence and HostBarrier as well as functions * that get static singletons of the 2. 
* * A fence flushes out and receives all messages in the network while a barrier * simply acts as a barrier in the code for all hosts. */ #include "galois/substrate/PerThreadStorage.h" #include "galois/runtime/Substrate.h" #include "galois/substrate/CompilerSpecific.h" #include "galois/runtime/Network.h" #include "galois/runtime/LWCI.h" #include #include #include #include #include "galois/runtime/BareMPI.h" namespace { class HostFence : public galois::substrate::Barrier { public: virtual const char* name() const { return "HostFence"; } virtual void reinit(unsigned) {} //! control-flow barrier across distributed hosts //! acts as a distributed-memory fence as well (flushes send and receives) virtual void wait() { auto& net = galois::runtime::getSystemNetworkInterface(); if (galois::runtime::evilPhase == 0) { galois::gWarn("evilPhase is 0, implying loop-around or no use: fence " "may not work correctly!"); } for (unsigned h = 0; h < net.Num; ++h) { if (h == net.ID) continue; galois::runtime::SendBuffer b; galois::runtime::gSerialize(b, net.ID + 1); // non-zero message net.sendTagged(h, galois::runtime::evilPhase, b); } net.flush(); // flush all sends unsigned received = 1; // self while (received < net.Num) { decltype(net.recieveTagged(galois::runtime::evilPhase, nullptr)) p; do { net.handleReceives(); // flush all receives from net.sendMsg() or // net.sendSimple() p = net.recieveTagged(galois::runtime::evilPhase, nullptr); } while (!p); assert(p->first != net.ID); // ignore received data ++received; } ++galois::runtime::evilPhase; if (galois::runtime::evilPhase >= static_cast( std::numeric_limits::max())) { // limit defined by MPI or // LCI galois::runtime::evilPhase = 1; } } }; class HostBarrier : public galois::substrate::Barrier { public: virtual const char* name() const { return "HostBarrier"; } virtual void reinit(unsigned) {} //! 
Control-flow barrier across distributed hosts virtual void wait() { #ifdef GALOIS_USE_LCI lc_barrier(lc_col_ep); #else MPI_Barrier(MPI_COMM_WORLD); // assumes MPI_THREAD_MULTIPLE #endif } }; } // end anonymous namespace galois::substrate::Barrier& galois::runtime::getHostBarrier() { static HostBarrier b; return b; } galois::substrate::Barrier& galois::runtime::getHostFence() { static HostFence b; return b; } ================================================ FILE: libdist/src/DistGalois.cpp ================================================ /* * This file belongs to the Galois project, a C++ library for exploiting * parallelism. The code is being released under the terms of the 3-Clause BSD * License (a copy is located in LICENSE.txt at the top-level directory). * * Copyright (C) 2018, The University of Texas at Austin. All rights reserved. * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY, * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF * DEALING OR USAGE OF TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances * shall University be liable for incidental, special, indirect, direct or * consequential damages or loss of profits, interruption of business, or * related expenses which may arise from use of Software or Documentation, * including but not limited to those resulting from defects in Software and/or * Documentation, or loss or inaccuracy of data of any kind. */ /** * @file DistGalois.cpp * * Includes the definitions for DistMemSys's constructor and destructor */ #include "galois/DistGalois.h" #include "galois/runtime/Network.h" //! DistMemSys constructor which calls the shared memory runtime constructor //! 
with the distributed stats manager galois::DistMemSys::DistMemSys() : galois::runtime::SharedMem() {} //! DistMemSys destructor which reports memory usage from the network galois::DistMemSys::~DistMemSys() { if (MORE_DIST_STATS) { auto& net = galois::runtime::getSystemNetworkInterface(); net.reportMemUsage(); } } ================================================ FILE: libdist/src/DistStats.cpp ================================================ /* * This file belongs to the Galois project, a C++ library for exploiting * parallelism. The code is being released under the terms of the 3-Clause BSD * License (a copy is located in LICENSE.txt at the top-level directory). * * Copyright (C) 2018, The University of Texas at Austin. All rights reserved. * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY, * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF * DEALING OR USAGE OF TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances * shall University be liable for incidental, special, indirect, direct or * consequential damages or loss of profits, interruption of business, or * related expenses which may arise from use of Software or Documentation, * including but not limited to those resulting from defects in Software and/or * Documentation, or loss or inaccuracy of data of any kind. */ /** * @file DistStats.cpp * * Contains implementations for DistStats.h. 
*/ #include "galois/runtime/DistStats.h" #include "galois/runtime/Serialize.h" #include "galois/DTerminationDetector.h" using namespace galois::runtime; DistStatManager* internal::distSysStatManager(void) { galois::runtime::StatManager* sm = internal::sysStatManager(); assert(sm && "StatManager not initialized"); DistStatManager* d = dynamic_cast(sm); assert(d && "dynamic_cast failed"); return d; } inline static DistStatManager* dsm(void) { return internal::distSysStatManager(); } DistStatManager::DistStatManager(const std::string& outfile) : StatManager(outfile) {} DistStatManager::~DistStatManager() { galois::runtime::internal::destroySystemNetworkInterface(); } class galois::runtime::StatRecvHelper { public: static void recvAtHost_0_hostTotalTy(galois::gstl::Str region, galois::gstl::Str category, StatTotal::Type totalTy) { dsm()->addRecvdHostTotalTy(region, category, totalTy); } static void recvAtHost_0_int(uint32_t hostID, galois::gstl::Str region, galois::gstl::Str category, int64_t thrdTotal, StatTotal::Type totalTy, const galois::gstl::Vector thrdVals) { dsm()->addRecvdStat(hostID, region, category, thrdTotal, totalTy, thrdVals); } static void recvAtHost_0_fp(uint32_t hostID, galois::gstl::Str region, galois::gstl::Str category, double thrdTotal, StatTotal::Type totalTy, const galois::gstl::Vector thrdVals) { dsm()->addRecvdStat(hostID, region, category, thrdTotal, totalTy, thrdVals); } static void recvAtHost_0_str(uint32_t hostID, galois::gstl::Str region, galois::gstl::Str category, galois::gstl::Str thrdTotal, StatTotal::Type totalTy, const galois::gstl::Vector thrdVals) { dsm()->addRecvdParam(hostID, region, category, thrdTotal, totalTy, thrdVals); } }; void DistStatManager::mergeStats(void) { Base::mergeStats(); hostTotalTypes.mergeStats(); combineAtHost_0(); } void DistStatManager::combineAtHost_0_helper(void) { const bool IS_HOST0 = getHostID() == 0; const auto& hTotalMap = hostTotalTypes.mergedMap(); size_t syncTypePhase = 0; if (!IS_HOST0) { for 
(auto i = hTotalMap.cbegin(), end_i = hTotalMap.cend(); i != end_i; ++i) { SendBuffer b; gSerialize(b, hTotalMap.region(i), hTotalMap.category(i), hTotalMap.stat(i).totalTy()); getSystemNetworkInterface().sendTagged(0, galois::runtime::evilPhase, b, syncTypePhase); } } ++syncTypePhase; for (auto i = Base::intBegin(), end_i = Base::intEnd(); i != end_i; ++i) { Str ln; Str cat; int64_t thrdTotal; StatTotal::Type totalTy; galois::gstl::Vector thrdVals; Base::readIntStat(i, ln, cat, thrdTotal, totalTy, thrdVals); if (IS_HOST0) { addRecvdStat(0, ln, cat, thrdTotal, totalTy, thrdVals); } else { SendBuffer b; gSerialize(b, ln, cat, thrdTotal, totalTy, thrdVals); getSystemNetworkInterface().sendTagged(0, galois::runtime::evilPhase, b, syncTypePhase); } } } void DistStatManager::combineAtHost_0_helper2(void) { const bool IS_HOST0 = getHostID() == 0; size_t syncTypePhase = 0; for (auto i = Base::fpBegin(), end_i = Base::fpEnd(); i != end_i; ++i) { Str ln; Str cat; double thrdTotal; StatTotal::Type totalTy; galois::gstl::Vector thrdVals; Base::readFPstat(i, ln, cat, thrdTotal, totalTy, thrdVals); if (IS_HOST0) { addRecvdStat(0, ln, cat, thrdTotal, totalTy, thrdVals); } else { SendBuffer b; gSerialize(b, ln, cat, thrdTotal, totalTy, thrdVals); getSystemNetworkInterface().sendTagged(0, galois::runtime::evilPhase, b, syncTypePhase); } } ++syncTypePhase; for (auto i = Base::paramBegin(), end_i = Base::paramEnd(); i != end_i; ++i) { Str ln; Str cat; Str thrdTotal; StatTotal::Type totalTy; galois::gstl::Vector thrdVals; Base::readParam(i, ln, cat, thrdTotal, totalTy, thrdVals); if (IS_HOST0) { addRecvdParam(0, ln, cat, thrdTotal, totalTy, thrdVals); } else { SendBuffer b; gSerialize(b, ln, cat, thrdTotal, totalTy, thrdVals); getSystemNetworkInterface().sendTagged(0, galois::runtime::evilPhase, b, syncTypePhase); } } } void DistStatManager::receiveAtHost_0_helper(void) { size_t syncTypePhase = 0; { decltype(getSystemNetworkInterface().recieveTagged( galois::runtime::evilPhase, 
nullptr, syncTypePhase)) p; do { p = getSystemNetworkInterface().recieveTagged(galois::runtime::evilPhase, nullptr, syncTypePhase); if (p) { RecvBuffer& b = p->second; galois::gstl::Str region; galois::gstl::Str category; StatTotal::Type totalTy; gDeserialize(b, region, category, totalTy); StatRecvHelper::recvAtHost_0_hostTotalTy(region, category, totalTy); } } while (p); } ++syncTypePhase; { decltype(getSystemNetworkInterface().recieveTagged( galois::runtime::evilPhase, nullptr, syncTypePhase)) p; do { p = getSystemNetworkInterface().recieveTagged(galois::runtime::evilPhase, nullptr, syncTypePhase); if (p) { uint32_t hostID = p->first; RecvBuffer& b = p->second; Str ln; Str cat; int64_t thrdTotal; StatTotal::Type totalTy; galois::gstl::Vector thrdVals; gDeserialize(b, ln, cat, thrdTotal, totalTy, thrdVals); StatRecvHelper::recvAtHost_0_int(hostID, ln, cat, thrdTotal, totalTy, thrdVals); } } while (p); } } void DistStatManager::receiveAtHost_0_helper2(void) { size_t syncTypePhase = 0; { decltype(getSystemNetworkInterface().recieveTagged( galois::runtime::evilPhase, nullptr, syncTypePhase)) p; do { p = getSystemNetworkInterface().recieveTagged(galois::runtime::evilPhase, nullptr, syncTypePhase); if (p) { uint32_t hostID = p->first; RecvBuffer& b = p->second; Str ln; Str cat; double thrdTotal; StatTotal::Type totalTy; galois::gstl::Vector thrdVals; gDeserialize(b, ln, cat, thrdTotal, totalTy, thrdVals); StatRecvHelper::recvAtHost_0_fp(hostID, ln, cat, thrdTotal, totalTy, thrdVals); } } while (p); } ++syncTypePhase; { decltype(getSystemNetworkInterface().recieveTagged( galois::runtime::evilPhase, nullptr, syncTypePhase)) p; do { p = getSystemNetworkInterface().recieveTagged(galois::runtime::evilPhase, nullptr, syncTypePhase); if (p) { uint32_t hostID = p->first; RecvBuffer& b = p->second; Str ln; Str cat; Str thrdTotal; StatTotal::Type totalTy; galois::gstl::Vector thrdVals; gDeserialize(b, ln, cat, thrdTotal, totalTy, thrdVals); 
StatRecvHelper::recvAtHost_0_str(hostID, ln, cat, thrdTotal, totalTy, thrdVals); } } while (p); } } void DistStatManager::combineAtHost_0(void) { galois::DGTerminator td; // host 0 reads stats from Base class // other hosts send stats to host 0 combineAtHost_0_helper(); getSystemNetworkInterface().flush(); // work done before check td += 1; // barrier while (td.reduce()) { td.reset(); if (getHostID() == 0) { // receive from other hosts receiveAtHost_0_helper(); } } // explicit barrier after logical barrier is required // as next async phase begins immediately getHostBarrier().wait(); // host 0 reads stats from Base class // other hosts send stats to host 0 combineAtHost_0_helper2(); getSystemNetworkInterface().flush(); td += 1; // barrier while (td.reduce()) { td.reset(); if (getHostID() == 0) { // receive from other hosts receiveAtHost_0_helper2(); } } // explicit barrier after logical barrier is required // as next async phase begins immediately getHostBarrier().wait(); } bool DistStatManager::printingHostVals(void) { return galois::substrate::EnvCheck(DistStatManager::HSTAT_ENV_VAR); } StatTotal::Type DistStatManager::findHostTotalTy(const Str& region, const Str& category, const StatTotal::Type& thrdTotalTy) const { StatTotal::Type hostTotalTy = thrdTotalTy; auto& mrgMap = hostTotalTypes.mergedMap(); auto i = mrgMap.findStat(region, category); if (i != mrgMap.cend()) { hostTotalTy = mrgMap.stat(i).totalTy(); } return hostTotalTy; } void DistStatManager::addRecvdHostTotalTy(const Str& region, const Str& category, const StatTotal::Type& totalTy) { hostTotalTypes.addToStat(region, category, totalTy); } void DistStatManager::addRecvdStat( unsigned hostID, const Str& region, const Str& category, int64_t thrdTotal, const StatTotal::Type& thrdTotalTy, const DistStatManager::ThrdVals& thrdVals) { intDistStats.addToStat( region, category, std::make_tuple(hostID, thrdTotal, thrdTotalTy, thrdVals), findHostTotalTy(region, category, thrdTotalTy)); } void 
DistStatManager::addRecvdStat( unsigned hostID, const Str& region, const Str& category, double thrdTotal, const StatTotal::Type& thrdTotalTy, const DistStatManager::ThrdVals& thrdVals) { fpDistStats.addToStat( region, category, std::make_tuple(hostID, thrdTotal, thrdTotalTy, thrdVals), findHostTotalTy(region, category, thrdTotalTy)); } void DistStatManager::addRecvdParam( unsigned hostID, const Str& region, const Str& category, const Str& thrdTotal, const StatTotal::Type& thrdTotalTy, const DistStatManager::ThrdVals& thrdVals) { strDistStats.addToStat( region, category, std::make_tuple(hostID, thrdTotal, thrdTotalTy, thrdVals), findHostTotalTy(region, category, thrdTotalTy)); } void DistStatManager::printHeader(std::ostream& out) const { out << "STAT_TYPE" << SEP; out << "HOST_ID" << SEP; out << "REGION" << SEP << "CATEGORY" << SEP; out << "TOTAL_TYPE" << SEP << "TOTAL"; out << std::endl; } void DistStatManager::printStats(std::ostream& out) { mergeStats(); galois::DGTerminator td; if (getHostID() == 0) { printHeader(out); intDistStats.print(out); fpDistStats.print(out); strDistStats.print(out); } // all hosts must wait for host 0 to finish printing stats while (td.reduce()) { }; } ================================================ FILE: libdist/src/Network.cpp ================================================ /* * This file belongs to the Galois project, a C++ library for exploiting * parallelism. The code is being released under the terms of the 3-Clause BSD * License (a copy is located in LICENSE.txt at the top-level directory). * * Copyright (C) 2018, The University of Texas at Austin. All rights reserved. * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY, * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF * DEALING OR USAGE OF TRADE. 
NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances * shall University be liable for incidental, special, indirect, direct or * consequential damages or loss of profits, interruption of business, or * related expenses which may arise from use of Software or Documentation, * including but not limited to those resulting from defects in Software and/or * Documentation, or loss or inaccuracy of data of any kind. */ /** * @file Network.cpp * * Contains implementations for basic NetworkInterface functions and * initializations of some NetworkInterface variables. */ #include "galois/runtime/Tracer.h" #include "galois/runtime/Network.h" #include "galois/runtime/NetworkIO.h" #include #include using namespace galois::runtime; uint32_t galois::runtime::evilPhase = 1; uint32_t galois::runtime::NetworkInterface::ID = 0; uint32_t galois::runtime::NetworkInterface::Num = 1; uint32_t galois::runtime::getHostID() { return NetworkInterface::ID; } galois::runtime::NetworkIO::~NetworkIO() {} void NetworkInterface::initializeMPI() { int supportProvided; int initSuccess = MPI_Init_thread(NULL, NULL, MPI_THREAD_MULTIPLE, &supportProvided); if (initSuccess != MPI_SUCCESS) { MPI_Abort(MPI_COMM_WORLD, initSuccess); } if (supportProvided != MPI_THREAD_MULTIPLE) { GALOIS_DIE("MPI_THREAD_MULTIPLE not supported."); } } void NetworkInterface::finalizeMPI() { int finalizeSuccess = MPI_Finalize(); if (finalizeSuccess != MPI_SUCCESS) { MPI_Abort(MPI_COMM_WORLD, finalizeSuccess); } galois::gDebug("[", NetworkInterface::ID, "] MPI finalized"); } NetworkInterface::NetworkInterface() {} NetworkInterface::~NetworkInterface() {} void NetworkInterface::reportMemUsage() const { std::string str("CommunicationMemUsage"); galois::runtime::reportStat_Tmin("dGraph", str + "Min", memUsageTracker.getMaxMemUsage()); galois::runtime::reportStat_Tmax("dGraph", str + "Max", memUsageTracker.getMaxMemUsage()); } // forward decl //! 
Receive broadcasted messages over the network static void bcastLandingPad(uint32_t src, ::RecvBuffer& buf); static void bcastLandingPad(uint32_t src, RecvBuffer& buf) { uintptr_t fp; gDeserialize(buf, fp); auto recv = (void (*)(uint32_t, RecvBuffer&))fp; trace("NetworkInterface::bcastLandingPad", (void*)recv); recv(src, buf); } void NetworkInterface::sendMsg(uint32_t dest, void (*recv)(uint32_t, RecvBuffer&), SendBuffer& buf) { gSerialize(buf, recv); sendTagged(dest, 0, buf); } void NetworkInterface::broadcast(void (*recv)(uint32_t, RecvBuffer&), SendBuffer& buf, bool self) { trace("NetworkInterface::broadcast", (void*)recv); auto fp = (uintptr_t)recv; for (unsigned x = 0; x < Num; ++x) { if (x != ID) { SendBuffer b; gSerialize(b, fp, buf, (uintptr_t)&bcastLandingPad); sendTagged(x, 0, b); } else if (self) { RecvBuffer rb(buf.begin(), buf.end()); recv(ID, rb); } } } void NetworkInterface::handleReceives() { std::unique_lock lg; auto opt = recieveTagged(0, &lg); while (opt) { uint32_t src = std::get<0>(*opt); RecvBuffer& buf = std::get<1>(*opt); uintptr_t fp = 0; gDeserializeRaw(buf.r_linearData() + buf.r_size() - sizeof(uintptr_t), fp); buf.pop_back(sizeof(uintptr_t)); assert(fp); auto f = (void (*)(uint32_t, RecvBuffer&))fp; f(src, buf); opt = recieveTagged(0, &lg); } } NetworkInterface& galois::runtime::getSystemNetworkInterface() { #ifndef GALOIS_USE_LCI return makeNetworkBuffered(); #else return makeNetworkLCI(); #endif } void galois::runtime::internal::destroySystemNetworkInterface() { // get net interface, then delete it NetworkInterface& netInterface = getSystemNetworkInterface(); delete &netInterface; } ================================================ FILE: libdist/src/NetworkBuffered.cpp ================================================ /* * This file belongs to the Galois project, a C++ library for exploiting * parallelism. 
The code is being released under the terms of the 3-Clause BSD * License (a copy is located in LICENSE.txt at the top-level directory). * * Copyright (C) 2018, The University of Texas at Austin. All rights reserved. * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY, * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF * DEALING OR USAGE OF TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances * shall University be liable for incidental, special, indirect, direct or * consequential damages or loss of profits, interruption of business, or * related expenses which may arise from use of Software or Documentation, * including but not limited to those resulting from defects in Software and/or * Documentation, or loss or inaccuracy of data of any kind. */ /** * @file NetworkBuffered.cpp * * Contains NetworkInterfaceBuffered, an implementation of a network interface * that buffers messages before sending them out. * * @todo document this file more */ #include "galois/runtime/Network.h" #include "galois/runtime/NetworkIO.h" #include "galois/runtime/Tracer.h" #ifdef GALOIS_USE_LCI #define NO_AGG #endif #include #include #include #include using namespace galois::runtime; using namespace galois::substrate; namespace { /** * @class NetworkInterfaceBuffered * * Buffered network interface: messages are buffered before they are sent out. * A single worker thread is initialized to send/receive messages from/to * buffers. */ class NetworkInterfaceBuffered : public NetworkInterface { static const int COMM_MIN = 1400; //! bytes (sligtly smaller than an ethernet packet) static const int COMM_DELAY = 100; //! 
microseconds delay unsigned long statSendNum; unsigned long statSendBytes; unsigned long statSendEnqueued; unsigned long statRecvNum; unsigned long statRecvBytes; unsigned long statRecvDequeued; bool anyReceivedMessages; // using vTy = std::vector; using vTy = galois::PODResizeableArray; /** * Receive buffers for the buffered network interface */ class recvBuffer { std::deque data; size_t frontOffset; SimpleLock qlock; // tag of head of queue std::atomic dataPresent; bool sizeAtLeast(size_t n, uint32_t tag) { size_t tot = -frontOffset; for (auto& v : data) { if (v.tag == tag) { tot += v.data.size(); if (tot >= n) return true; } else { return false; } } return false; } template void copyOut(IterTy it, size_t n) { // assert(sizeAtLeast(n)); // fast path is first buffer { // limit scope auto& f0data = data[0].data; for (int k = frontOffset, ke = f0data.size(); k < ke && n; ++k, --n) *it++ = f0data[k]; } if (n) { // more data (slow path) for (int j = 1, je = data.size(); j < je && n; ++j) { auto& vdata = data[j].data; for (int k = 0, ke = vdata.size(); k < ke && n; ++k, --n) { *it++ = vdata[k]; } } } } /** * Return a (moved) vector if the len bytes requested are the last len * bytes of the front of the buffer queue */ std::optional popVec(uint32_t len, std::atomic& inflightRecvs) { if (data[0].data.size() == frontOffset + len) { vTy retval(std::move(data[0].data)); data.pop_front(); --inflightRecvs; frontOffset = 0; if (data.size()) { dataPresent = data.front().tag; } else { dataPresent = ~0; } return std::optional(std::move(retval)); } else { return std::optional(); } } void erase(size_t n, std::atomic& inflightRecvs) { frontOffset += n; while (frontOffset && frontOffset >= data.front().data.size()) { frontOffset -= data.front().data.size(); data.pop_front(); --inflightRecvs; } if (data.size()) { dataPresent = data.front().tag; } else { dataPresent = ~0; } } uint32_t getLenFromFront(uint32_t tag) { if (sizeAtLeast(sizeof(uint32_t), tag)) { union { uint8_t 
a[sizeof(uint32_t)]; uint32_t b; } c; copyOut(&c.a[0], sizeof(uint32_t)); return c.b; } else { return ~0; } } public: std::optional popMsg(uint32_t tag, std::atomic& inflightRecvs) { std::lock_guard lg(qlock); #ifndef NO_AGG uint32_t len = getLenFromFront(tag); // assert(len); if (len == ~0U || len == 0) return std::optional(); if (!sizeAtLeast(sizeof(uint32_t) + len, tag)) return std::optional(); erase(4, inflightRecvs); // Try just using the buffer if (auto r = popVec(len, inflightRecvs)) { auto start = r->size() - len; // std::cerr << "FP " << r->size() << " " << len << " " << start // << "\n"; return std::optional(RecvBuffer(std::move(*r), start)); } RecvBuffer buf(len); // FIXME: This is slows things down 25% copyOut((char*)buf.linearData(), len); erase(len, inflightRecvs); // std::cerr << "p " << tag << " " << len << "\n"; return std::optional(std::move(buf)); #else if (data.empty() || data.front().tag != tag) return std::optional(); vTy vec(std::move(data.front().data)); data.pop_front(); --inflightRecvs; if (!data.empty()) { dataPresent = data.front().tag; } else { dataPresent = ~0; } return std::optional(RecvBuffer(std::move(vec), 0)); #endif } // Worker thread interface void add(NetworkIO::message m) { std::lock_guard lg(qlock); if (data.empty()) { galois::runtime::trace("ADD LATEST ", m.tag); dataPresent = m.tag; } // std::cerr << m.data.size() << " " << // std::count(m.data.begin(), m.data.end(), 0) << "\n"; // for (auto x : m.data) { // std::cerr << (int) x << " "; // } // std::cerr << "\n"; // std::cerr << "A " << m.host << " " << m.tag << " " << m.data.size() << // "\n"; data.push_back(std::move(m)); assert(data.back().data.size() != (unsigned int)std::count(data.back().data.begin(), data.back().data.end(), 0)); } bool hasData(uint32_t tag) { return dataPresent == tag; } size_t size() { return data.size(); } uint32_t getPresentTag() { return dataPresent; } }; // end recv buffer class std::vector recvData; std::vector recvLock; /** * Send buffers for 
the buffered network interface */ class sendBuffer { struct msg { uint32_t tag; vTy data; msg(uint32_t t, vTy& _data) : tag(t), data(std::move(_data)) {} }; std::deque messages; std::atomic numBytes; std::atomic urgent; //! @todo FIXME track time since some epoch in an atomic. std::chrono::high_resolution_clock::time_point time; SimpleLock lock, timelock; public: unsigned long statSendTimeout; unsigned long statSendOverflow; unsigned long statSendUrgent; size_t size() { return messages.size(); } void markUrgent() { if (numBytes) { std::lock_guard lg(lock); urgent = messages.size(); } } bool ready() { #ifndef NO_AGG if (numBytes == 0) return false; if (urgent) { ++statSendUrgent; return true; } if (numBytes > COMM_MIN) { ++statSendOverflow; return true; } auto n = std::chrono::high_resolution_clock::now(); decltype(n) mytime; { std::lock_guard lg(timelock); mytime = time; } auto elapsed = std::chrono::duration_cast(n - mytime); if (elapsed.count() > COMM_DELAY) { ++statSendTimeout; return true; } return false; #else return messages.size() > 0; #endif } std::pair assemble(std::atomic& GALOIS_UNUSED(inflightSends)) { std::unique_lock lg(lock); if (messages.empty()) return std::make_pair(~0, vTy()); #ifndef NO_AGG // compute message size uint32_t len = 0; int num = 0; uint32_t tag = messages.front().tag; for (auto& m : messages) { if (m.tag != tag) { break; } else { // do not let it go over the integer limit because MPI_Isend cannot // deal with it if ((m.data.size() + sizeof(uint32_t) + len + num) > static_cast(std::numeric_limits::max())) { break; } len += m.data.size(); num += sizeof(uint32_t); } } lg.unlock(); // construct message vTy vec; vec.reserve(len + num); // go out of our way to avoid locking out senders when making messages lg.lock(); do { auto& m = messages.front(); lg.unlock(); union { uint32_t a; uint8_t b[sizeof(uint32_t)]; } foo; foo.a = m.data.size(); vec.insert(vec.end(), &foo.b[0], &foo.b[sizeof(uint32_t)]); vec.insert(vec.end(), m.data.begin(), 
m.data.end()); if (urgent) --urgent; lg.lock(); messages.pop_front(); --inflightSends; } while (vec.size() < len + num); ++inflightSends; numBytes -= len; #else uint32_t tag = messages.front().tag; vTy vec(std::move(messages.front().data)); messages.pop_front(); #endif return std::make_pair(tag, std::move(vec)); } void add(uint32_t tag, vTy& b) { std::lock_guard lg(lock); if (messages.empty()) { std::lock_guard lg(timelock); time = std::chrono::high_resolution_clock::now(); } unsigned oldNumBytes = numBytes; numBytes += b.size(); galois::runtime::trace("BufferedAdd", oldNumBytes, numBytes, tag, galois::runtime::printVec(b)); messages.emplace_back(tag, b); } }; // end send buffer class std::vector sendData; void workerThread() { initializeMPI(); int rank; int hostSize; int rankSuccess = MPI_Comm_rank(MPI_COMM_WORLD, &rank); if (rankSuccess != MPI_SUCCESS) { MPI_Abort(MPI_COMM_WORLD, rankSuccess); } int sizeSuccess = MPI_Comm_size(MPI_COMM_WORLD, &hostSize); if (sizeSuccess != MPI_SUCCESS) { MPI_Abort(MPI_COMM_WORLD, sizeSuccess); } galois::gDebug("[", NetworkInterface::ID, "] MPI initialized"); std::tie(netio, ID, Num) = makeNetworkIOMPI(memUsageTracker, inflightSends, inflightRecvs); assert(ID == (unsigned)rank); assert(Num == (unsigned)hostSize); ready = 1; while (ready < 2) { /*fprintf(stderr, "[WaitOnReady-2]");*/ }; while (ready != 3) { for (unsigned i = 0; i < sendData.size(); ++i) { netio->progress(); // handle send queue i auto& sd = sendData[i]; if (sd.ready()) { NetworkIO::message msg; msg.host = i; std::tie(msg.tag, msg.data) = sd.assemble(inflightSends); galois::runtime::trace("BufferedSending", msg.host, msg.tag, galois::runtime::printVec(msg.data)); ++statSendEnqueued; netio->enqueue(std::move(msg)); } // handle receive NetworkIO::message rdata = netio->dequeue(); if (rdata.data.size()) { ++statRecvDequeued; assert(rdata.data.size() != (unsigned int)std::count(rdata.data.begin(), rdata.data.end(), 0)); galois::runtime::trace("BufferedRecieving", 
rdata.host, rdata.tag, galois::runtime::printVec(rdata.data)); recvData[rdata.host].add(std::move(rdata)); } } } finalizeMPI(); } std::thread worker; std::atomic ready; public: using NetworkInterface::ID; using NetworkInterface::Num; NetworkInterfaceBuffered() { inflightSends = 0; inflightRecvs = 0; ready = 0; anyReceivedMessages = false; worker = std::thread(&NetworkInterfaceBuffered::workerThread, this); while (ready != 1) { }; recvData = decltype(recvData)(Num); recvLock.resize(Num); sendData = decltype(sendData)(Num); ready = 2; } virtual ~NetworkInterfaceBuffered() { ready = 3; worker.join(); } std::unique_ptr netio; virtual void sendTagged(uint32_t dest, uint32_t tag, SendBuffer& buf, int phase) { ++inflightSends; tag += phase; statSendNum += 1; statSendBytes += buf.size(); galois::runtime::trace("sendTagged", dest, tag, galois::runtime::printVec(buf.getVec())); auto& sd = sendData[dest]; sd.add(tag, buf.getVec()); } virtual std::optional> recieveTagged(uint32_t tag, std::unique_lock* rlg, int phase) { tag += phase; for (unsigned h = 0; h < recvData.size(); ++h) { auto& rq = recvData[h]; if (rq.hasData(tag)) { if (recvLock[h].try_lock()) { std::unique_lock lg(recvLock[h], std::adopt_lock); auto buf = rq.popMsg(tag, inflightRecvs); if (buf) { ++statRecvNum; statRecvBytes += buf->size(); memUsageTracker.decrementMemUsage(buf->size()); if (rlg) *rlg = std::move(lg); galois::runtime::trace("recvTagged", h, tag, galois::runtime::printVec(buf->getVec())); anyReceivedMessages = true; return std::optional>( std::make_pair(h, std::move(*buf))); } } } galois::runtime::trace("recvTagged BLOCKED this by that", tag, rq.getPresentTag()); } return std::optional>(); } virtual void flush() { for (auto& sd : sendData) sd.markUrgent(); } virtual bool anyPendingSends() { return (inflightSends > 0); } virtual bool anyPendingReceives() { if (anyReceivedMessages) { // might not be acted on by the computation yet anyReceivedMessages = false; // galois::gDebug("[", ID, "] receive out 
of buffer \n"); return true; } // if (inflightRecvs > 0) { // galois::gDebug("[", ID, "] inflight receive: ", inflightRecvs, " \n"); // } return (inflightRecvs > 0); } virtual unsigned long reportSendBytes() const { return statSendBytes; } virtual unsigned long reportSendMsgs() const { return statSendNum; } virtual unsigned long reportRecvBytes() const { return statRecvBytes; } virtual unsigned long reportRecvMsgs() const { return statRecvNum; } virtual std::vector reportExtra() const { std::vector retval(5); for (auto& sd : sendData) { retval[0] += sd.statSendTimeout; retval[1] += sd.statSendOverflow; retval[2] += sd.statSendUrgent; } retval[3] = statSendEnqueued; retval[4] = statRecvDequeued; return retval; } virtual std::vector> reportExtraNamed() const { std::vector> retval(5); retval[0].first = "SendTimeout"; retval[1].first = "SendOverflow"; retval[2].first = "SendUrgent"; retval[3].first = "SendEnqueued"; retval[4].first = "RecvDequeued"; for (auto& sd : sendData) { retval[0].second += sd.statSendTimeout; retval[1].second += sd.statSendOverflow; retval[2].second += sd.statSendUrgent; } retval[3].second = statSendEnqueued; retval[4].second = statRecvDequeued; return retval; } }; } // namespace /** * Create a buffered network interface, or return one if already * created. */ NetworkInterface& galois::runtime::makeNetworkBuffered() { static std::atomic net; static substrate::SimpleLock m_mutex; // create the interface if it doesn't yet exist in the static variable auto* tmp = net.load(); if (tmp == nullptr) { std::lock_guard lock(m_mutex); tmp = net.load(); if (tmp == nullptr) { tmp = new NetworkInterfaceBuffered(); net.store(tmp); } } return *tmp; } ================================================ FILE: libdist/src/NetworkIOMPI.cpp ================================================ /* * This file belongs to the Galois project, a C++ library for exploiting * parallelism. 
The code is being released under the terms of the 3-Clause BSD * License (a copy is located in LICENSE.txt at the top-level directory). * * Copyright (C) 2018, The University of Texas at Austin. All rights reserved. * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY, * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF * DEALING OR USAGE OF TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances * shall University be liable for incidental, special, indirect, direct or * consequential damages or loss of profits, interruption of business, or * related expenses which may arise from use of Software or Documentation, * including but not limited to those resulting from defects in Software and/or * Documentation, or loss or inaccuracy of data of any kind. */ /** * @file NetworkIOMPI.cpp * * Contains an implementation of network IO that uses MPI. */ #include "galois/runtime/NetworkIO.h" #include "galois/runtime/Tracer.h" #include "galois/substrate/SimpleLock.h" /** * MPI implementation of network IO. ASSUMES THAT MPI IS INITIALIZED * UPON CREATION OF THIS OBJECT. */ class NetworkIOMPI : public galois::runtime::NetworkIO { private: /** * Get the host id of the caller. * * @returns host id of the caller with regard to the MPI setup */ static int getID() { int taskRank; handleError(MPI_Comm_rank(MPI_COMM_WORLD, &taskRank)); return taskRank; } /** * Get the total number of hosts in the system. * * @returns number of hosts with regard to the MPI setup */ static int getNum() { int numTasks; handleError(MPI_Comm_size(MPI_COMM_WORLD, &numTasks)); return numTasks; } /** * Get both the ID of the caller + number of hosts. 
*/ std::pair getIDAndHostNum() { return std::make_pair(getID(), getNum()); } /** * Message type to send/recv in this network IO layer. */ struct mpiMessage { uint32_t host; uint32_t tag; vTy data; MPI_Request req; // mpiMessage(message&& _m, MPI_Request _req) : m(std::move(_m)), req(_req) // {} mpiMessage(uint32_t host, uint32_t tag, vTy&& data) : host(host), tag(tag), data(std::move(data)) {} mpiMessage(uint32_t host, uint32_t tag, size_t len) : host(host), tag(tag), data(len) {} }; /** * Send queue structure. */ struct sendQueueTy { std::deque inflight; galois::runtime::MemUsageTracker& memUsageTracker; std::atomic& inflightSends; sendQueueTy(galois::runtime::MemUsageTracker& tracker, std::atomic& sends) : memUsageTracker(tracker), inflightSends(sends) {} void complete() { while (!inflight.empty()) { int flag = 0; MPI_Status status; auto& f = inflight.front(); int rv = MPI_Test(&f.req, &flag, &status); handleError(rv); if (flag) { memUsageTracker.decrementMemUsage(f.data.size()); inflight.pop_front(); --inflightSends; } else break; } } void send(message m) { inflight.emplace_back(m.host, m.tag, std::move(m.data)); auto& f = inflight.back(); galois::runtime::trace("MPI SEND", f.host, f.tag, f.data.size(), galois::runtime::printVec(f.data)); #ifdef GALOIS_SUPPORT_ASYNC int rv = MPI_Issend(f.data.data(), f.data.size(), MPI_BYTE, f.host, f.tag, MPI_COMM_WORLD, &f.req); #else int rv = MPI_Isend(f.data.data(), f.data.size(), MPI_BYTE, f.host, f.tag, MPI_COMM_WORLD, &f.req); #endif handleError(rv); } }; /** * Receive queue structure */ struct recvQueueTy { std::deque done; std::deque inflight; galois::runtime::MemUsageTracker& memUsageTracker; std::atomic& inflightRecvs; recvQueueTy(galois::runtime::MemUsageTracker& tracker, std::atomic& recvs) : memUsageTracker(tracker), inflightRecvs(recvs) {} // FIXME: Does synchronous recieves overly halt forward progress? 
void probe() { int flag = 0; MPI_Status status; // check for new messages int rv = MPI_Iprobe(MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &flag, &status); handleError(rv); if (flag) { #ifdef GALOIS_USE_BARE_MPI assert(status.MPI_TAG <= 32767); if (status.MPI_TAG != 32767) { #endif ++inflightRecvs; int nbytes; rv = MPI_Get_count(&status, MPI_BYTE, &nbytes); handleError(rv); inflight.emplace_back(status.MPI_SOURCE, status.MPI_TAG, nbytes); auto& m = inflight.back(); memUsageTracker.incrementMemUsage(m.data.size()); rv = MPI_Irecv(m.data.data(), nbytes, MPI_BYTE, status.MPI_SOURCE, status.MPI_TAG, MPI_COMM_WORLD, &m.req); handleError(rv); galois::runtime::trace("MPI IRECV", status.MPI_SOURCE, status.MPI_TAG, m.data.size()); #ifdef GALOIS_USE_BARE_MPI } #endif } // complete messages if (!inflight.empty()) { auto& m = inflight.front(); int flag = 0; rv = MPI_Test(&m.req, &flag, MPI_STATUS_IGNORE); handleError(rv); if (flag) { done.emplace_back(m.host, m.tag, std::move(m.data)); inflight.pop_front(); } } } }; sendQueueTy sendQueue; recvQueueTy recvQueue; public: /** * Constructor. * * @param tracker memory usage tracker * @param sends * @param recvs * @param [out] ID this machine's host id * @param [out] NUM total number of hosts in the system */ NetworkIOMPI(galois::runtime::MemUsageTracker& tracker, std::atomic& sends, std::atomic& recvs, uint32_t& ID, uint32_t& NUM) : NetworkIO(tracker, sends, recvs), sendQueue(tracker, inflightSends), recvQueue(tracker, inflightRecvs) { auto p = getIDAndHostNum(); ID = p.first; NUM = p.second; } /** * Adds a message to the send queue */ virtual void enqueue(message m) { memUsageTracker.incrementMemUsage(m.data.size()); sendQueue.send(std::move(m)); } /** * Attempts to get a message from the recv queue. */ virtual message dequeue() { if (!recvQueue.done.empty()) { auto msg = std::move(recvQueue.done.front()); recvQueue.done.pop_front(); return msg; } return message{~0U, 0, vTy()}; } /** * Push progress forward in the system. 
*/ virtual void progress() { sendQueue.complete(); recvQueue.probe(); } }; // end NetworkIOMPI class std::tuple, uint32_t, uint32_t> galois::runtime::makeNetworkIOMPI(galois::runtime::MemUsageTracker& tracker, std::atomic& sends, std::atomic& recvs) { uint32_t ID, NUM; std::unique_ptr n{ new NetworkIOMPI(tracker, sends, recvs, ID, NUM)}; return std::make_tuple(std::move(n), ID, NUM); } ================================================ FILE: libdist/src/NetworkLCI.cpp ================================================ /* * This file belongs to the Galois project, a C++ library for exploiting * parallelism. The code is being released under the terms of the 3-Clause BSD * License (a copy is located in LICENSE.txt at the top-level directory). * * Copyright (C) 2018, The University of Texas at Austin. All rights reserved. * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY, * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF * DEALING OR USAGE OF TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances * shall University be liable for incidental, special, indirect, direct or * consequential damages or loss of profits, interruption of business, or * related expenses which may arise from use of Software or Documentation, * including but not limited to those resulting from defects in Software and/or * Documentation, or loss or inaccuracy of data of any kind. */ /** * @file NetworkBuffered.cpp * * Contains NetworkInterfaceLCI, an implementation of a network interface * that buffers messages before sending them out. 
* * @todo document this file more */ #ifdef GALOIS_USE_LCI #include "galois/runtime/Network.h" #include "galois/runtime/NetworkIO.h" #include "galois/runtime/Tracer.h" #include "galois/runtime/LWCI.h" using vTy = galois::PODResizeableArray; #include #include #include #include #include #include using namespace galois::runtime; using namespace galois::substrate; /* CRC-32C (iSCSI) polynomial in reversed bit order. */ #define POLY 0x82f63b78 inline uint32_t crc32c(char* buf, size_t len) { uint32_t crc = 0; int k; crc = ~crc; while (len--) { crc ^= *buf++; for (k = 0; k < 8; k++) crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1; } return ~crc; } lc_ep lc_p2p_ep[3]; lc_ep lc_col_ep; struct pendingReq { uint32_t dest; uint32_t tag; int phase; vTy buf; lc_req req; std::atomic& inflight; pendingReq(uint32_t _d, uint32_t _t, int _p, vTy& _buf, std::atomic& s) : dest(_d), tag(_t), phase(_p), buf(std::move(_buf)), inflight(s) { s++; } ~pendingReq() { inflight--; } }; static void* alloc_req(size_t size, void** ctx) { vTy** vector = (vTy**)ctx; *vector = new vTy(size); return (*vector)->data(); } static void free_req(void* ctx) { pendingReq* req = (pendingReq*)ctx; delete req; } namespace { /** * @class NetworkInterfaceLCI * * Buffered network interface: messages are buffered before they are sent out. * A single worker thread is initialized to send/receive messages from/to * buffers. 
*/ class NetworkInterfaceLCI : public NetworkInterface { unsigned long statSendNum; unsigned long statSendBytes; unsigned long statSendEnqueued; unsigned long statRecvNum; unsigned long statRecvBytes; unsigned long statRecvDequeued; bool anyReceivedMessages; // using vTy = std::vector; using vTy = galois::PODResizeableArray; public: void workerThread() { // Initialize LWCI // makeNetworkIOLWCI(memUsageTracker, inflightSends, inflightRecvs); if (ID == 0) fprintf(stderr, "**Using LWCI Communication layer**\n"); ready = 1; while (ready < 2) { /*fprintf(stderr, "[WaitOnReady-2]");*/ }; while (ready != 3) { lc_progress(0); lc_req* req_ptr; for (int phase = 0; phase < 3; phase++) { if (lc_cq_pop(lc_p2p_ep[phase], &req_ptr) == LC_OK) { int bin = ((req_ptr->meta % 3) * 3) + phase; bufferedRecv[bin].push(convertReq(req_ptr, phase)); } } sched_yield(); } } std::thread worker; std::atomic ready; public: using NetworkInterface::ID; using NetworkInterface::Num; NetworkInterfaceLCI() { lc_init(1, &lc_col_ep); lc_opt opt; opt.dev = 0; opt.desc = LC_DYN_CQ; opt.alloc = alloc_req; lc_ep_dup(&opt, lc_col_ep, &lc_p2p_ep[0]); lc_ep_dup(&opt, lc_col_ep, &lc_p2p_ep[1]); lc_ep_dup(&opt, lc_col_ep, &lc_p2p_ep[2]); lc_get_proc_num((int*)&ID); lc_get_num_proc((int*)&Num); inflightSends = 0; inflightRecvs = 0; ready = 0; anyReceivedMessages = false; worker = std::thread(&NetworkInterfaceLCI::workerThread, this); while (ready != 1) { }; ready = 2; } virtual ~NetworkInterfaceLCI() { ready = 3; worker.join(); } boost::lockfree::queue bufferedRecv[9]; // [0, 1, 2] [0, 1, 2] 0: normal, 1: reduce, 2: AM virtual void sendTagged(uint32_t dest, uint32_t tag, SendBuffer& buf, int phase) { if (tag == 0) phase = 2; statSendNum += 1; statSendBytes += buf.size(); // int count = 0; #ifndef GALOIS_SUPPORT_ASYNC if (buf.getVec().size() < 8192) { while (lc_sendm(buf.getVec().data(), buf.getVec().size(), dest, tag, lc_p2p_ep[phase]) != LC_OK) { sched_yield(); } } else #endif { pendingReq* msg = new 
pendingReq(dest, tag, phase, buf.getVec(), inflightSends); while (lc_sendl(msg->buf.data(), msg->buf.size(), dest, tag, lc_p2p_ep[phase], free_req, msg) != LC_OK) { sched_yield(); } } } inline pendingReq* convertReq(lc_req* req_ptr, int phase) { // Need to drain LCI queue to allow more injection. // Convert internal LCI request to a Galois pending request. vTy buf = std::move(*((vTy*)(req_ptr->ctx))); int rank = req_ptr->rank; int meta = req_ptr->meta; delete (vTy*)req_ptr->ctx; lc_cq_reqfree(lc_p2p_ep[phase], req_ptr); return new pendingReq(rank, meta, phase, buf, inflightRecvs); } virtual std::optional> recieveTagged(uint32_t tag, std::unique_lock* /*rlg*/, int phase) { if (tag == 0) phase = 2; // static int count = 0; pendingReq* req; int bin = ((tag % 3) * 3) + phase; if (!bufferedRecv[bin].pop(req)) { // if (count ++ == 10000) { // printf("[%d] WARNING possible lock out on RECV %d\n", ID, tag); // } return std::optional>(); } if (req->tag == tag) { vTy buf = std::move(req->buf); int dest = req->dest; delete req; return std::optional>( std::make_pair(dest, std::move(buf))); } else { printf("[%d] WARNING possible lock out, wrong tag %d/%d.\n", ID, req->tag, tag); return std::optional>(); } } virtual void flush() {} virtual bool anyPendingSends() { // static int count = 0; // if (count++ == 10000) // printf("[%d] WARNING possible lock out terminate %d %d\n", ID, // inflightSends.load(), inflightRecvs.load()); return (inflightSends > 0); } virtual bool anyPendingReceives() { if (anyReceivedMessages) { // might not be acted on by the computation yet anyReceivedMessages = false; // galois::gDebug("[", ID, "] receive out of buffer \n"); return true; } // if (inflightRecvs > 0) { // galois::gDebug("[", ID, "] inflight receive: ", inflightRecvs, " \n"); // } return (inflightRecvs > 0); } virtual unsigned long reportSendBytes() const { return statSendBytes; } virtual unsigned long reportSendMsgs() const { return statSendNum; } virtual unsigned long reportRecvBytes() 
const { return statRecvBytes; } virtual unsigned long reportRecvMsgs() const { return statRecvNum; } virtual std::vector reportExtra() const { std::vector retval(5); return retval; } virtual std::vector> reportExtraNamed() const { std::vector> retval(5); retval[0].first = "SendTimeout"; retval[1].first = "SendOverflow"; retval[2].first = "SendUrgent"; retval[3].first = "SendEnqueued"; retval[4].first = "RecvDequeued"; retval[3].second = statSendEnqueued; retval[4].second = statRecvDequeued; return retval; } }; } // namespace /** * Create a buffered network interface, or return one if already * created. */ NetworkInterface& galois::runtime::makeNetworkLCI() { static std::atomic net; static substrate::SimpleLock m_mutex; // create the interface if it doesn't yet exist in the static variable auto* tmp = net.load(); if (tmp == nullptr) { std::lock_guard lock(m_mutex); tmp = net.load(); if (tmp == nullptr) { tmp = new NetworkInterfaceLCI(); net.store(tmp); } } return *tmp; } #endif ================================================ FILE: libgalois/CMakeLists.txt ================================================ add_library(galois_shmem) add_library(Galois::shmem ALIAS galois_shmem) set_target_properties(galois_shmem PROPERTIES EXPORT_NAME shmem) add_dependencies(lib galois_shmem) configure_file(src/Version.cpp.in Version.cpp @ONLY) configure_file(include/galois/config.h.in include/galois/config.h) set(sources "${CMAKE_CURRENT_BINARY_DIR}/Version.cpp" src/Barrier_Counting.cpp src/Barrier.cpp src/Barrier_Dissemination.cpp src/Barrier_MCS.cpp src/Barrier_Pthread.cpp src/Barrier_Simple.cpp src/Barrier_Topo.cpp src/Context.cpp src/Deterministic.cpp src/DynamicBitset.cpp src/EnvCheck.cpp src/FileGraph.cpp src/FileGraphParallel.cpp src/gIO.cpp src/GraphHelpers.cpp src/HWTopo.cpp src/Mem.cpp src/NumaMem.cpp src/OCFileGraph.cpp src/PageAlloc.cpp src/PagePool.cpp src/PagePool.cpp src/ParaMeter.cpp src/PerThreadStorage.cpp src/PreAlloc.cpp src/Profile.cpp src/PtrLock.cpp 
src/SharedMem.cpp src/SharedMemSys.cpp src/SimpleLock.cpp src/Statistics.cpp src/Substrate.cpp src/Support.cpp src/Termination.cpp src/ThreadPool.cpp src/Threads.cpp src/ThreadTimer.cpp src/Timer.cpp src/Tracer.cpp ) if (${CMAKE_SYSTEM_NAME} MATCHES "Darwin") list(APPEND sources src/HWTopoDarwin.cpp) else() include(CheckSchedSetAffinity) if (NOT SCHED_SETAFFINITY_FOUND) if (GALOIS_STRICT_CONFIG) message(FATAL_ERROR "Need sched_setaffinity") endif() endif() list(APPEND sources src/HWTopoLinux.cpp) endif() target_sources(galois_shmem PRIVATE ${sources}) target_include_directories(galois_shmem PUBLIC $ $ $ ) if (TARGET Boost::Boost) # Autogenerated conan module doesn't provide header-only target. Extract one # manually. get_target_property(include_dirs Boost::Boost INTERFACE_INCLUDE_DIRECTORIES) target_include_directories(galois_shmem PUBLIC ${include_dirs}) else() # Standard CMake Boost module target_link_libraries(galois_shmem PUBLIC Boost::boost) endif() if (SCHED_SETAFFINITY_FOUND) target_compile_definitions(galois_shmem PRIVATE GALOIS_USE_SCHED_SETAFFINITY) target_link_libraries(galois_shmem PRIVATE ${SCHED_SETAFFINITY_LIBRARIES}) endif() target_link_libraries(galois_shmem INTERFACE pygalois) target_link_libraries(galois_shmem PRIVATE Threads::Threads) if (CMAKE_HAVE_PTHREAD_H) target_compile_definitions(galois_shmem PRIVATE GALOIS_HAVE_PTHREAD) endif() find_package(NUMA) if (NUMA_FOUND) target_compile_definitions(galois_shmem PRIVATE GALOIS_USE_NUMA) target_link_libraries(galois_shmem PRIVATE ${NUMA_LIBRARY}) else() message(WARNING "No NUMA Support. 
Likely poor performance for multi-socket systems.") endif() if (VTune_FOUND) target_link_libraries(galois_shmem PRIVATE ${VTune_LIBRARIES}) endif() add_subdirectory(test) install( DIRECTORY include/ DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}" COMPONENT dev FILES_MATCHING PATTERN "*.h" ) install( DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/include/ DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}" COMPONENT dev FILES_MATCHING PATTERN "*.h" ) install( TARGETS galois_shmem EXPORT GaloisTargets LIBRARY DESTINATION "${CMAKE_INSTALL_LIBDIR}" COMPONENT shlib ARCHIVE DESTINATION "${CMAKE_INSTALL_LIBDIR}" COMPONENT lib INCLUDES DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}" ) ================================================ FILE: libgalois/include/galois/ArrayWrapper.h ================================================ /* * This file belongs to the Galois project, a C++ library for exploiting * parallelism. The code is being released under the terms of the 3-Clause BSD * License (a copy is located in LICENSE.txt at the top-level directory). * * Copyright (C) 2018, The University of Texas at Austin. All rights reserved. * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY, * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF * DEALING OR USAGE OF TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances * shall University be liable for incidental, special, indirect, direct or * consequential damages or loss of profits, interruption of business, or * related expenses which may arise from use of Software or Documentation, * including but not limited to those resulting from defects in Software and/or * Documentation, or loss or inaccuracy of data of any kind. 
*/ /** * @file ArrayWrapper.h * * Defines the CopyableArray subclass used to make arrays trivially copyable if * possible. */ #ifndef _ARRAY_WRAPPER_H_ #define _ARRAY_WRAPPER_H_ #include #include "galois/config.h" #include "galois/runtime/ExtraTraits.h" namespace galois { /** * A subclass of std::array that is marked trivially copyable if the type is * also memory copyable. Useful when you need a trivially copyable type for * serialization. * * @tparam T type of the items to be stored in the array * @tparam N total number of items in the array */ template class CopyableArray : public std::array { public: //! Only typedef tt_is_copyable if T is trivially copyable. //! Allows the use of memcopy in serialize/deserialize. using tt_is_copyable = typename std::enable_if::value, int>::type; }; } // namespace galois #endif ================================================ FILE: libgalois/include/galois/AtomicHelpers.h ================================================ /* * This file belongs to the Galois project, a C++ library for exploiting * parallelism. The code is being released under the terms of the 3-Clause BSD * License (a copy is located in LICENSE.txt at the top-level directory). * * Copyright (C) 2018, The University of Texas at Austin. All rights reserved. * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY, * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF * DEALING OR USAGE OF TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. 
Under no circumstances * shall University be liable for incidental, special, indirect, direct or * consequential damages or loss of profits, interruption of business, or * related expenses which may arise from use of Software or Documentation, * including but not limited to those resulting from defects in Software and/or * Documentation, or loss or inaccuracy of data of any kind. */ #pragma once #include #include #include #include "galois/config.h" namespace galois { /** galois::atomicMax + non-atomic max calls **/ template const Ty atomicMax(std::atomic& a, const Ty b) { Ty old_a = a.load(std::memory_order_relaxed); // if old value is less than new value, atomically exchange while (old_a < b && !a.compare_exchange_weak(old_a, b, std::memory_order_relaxed)) ; return old_a; } template const Ty max(std::atomic& a, const Ty& b) { Ty old_a = a.load(std::memory_order_relaxed); if (a < b) { a.store(b, std::memory_order_relaxed); } return old_a; } template const Ty max(Ty& a, const Ty& b) { Ty old_a = a; if (a < b) { a = b; } return old_a; } /** galois::atomicMin **/ template const Ty atomicMin(std::atomic& a, const Ty b) { Ty old_a = a.load(std::memory_order_relaxed); while (old_a > b && !a.compare_exchange_weak(old_a, b, std::memory_order_relaxed)) ; return old_a; } template const Ty min(std::atomic& a, const Ty& b) { Ty old_a = a.load(std::memory_order_relaxed); if (a > b) { a.store(b, std::memory_order_relaxed); } return old_a; } template const Ty min(Ty& a, const Ty& b) { Ty old_a = a; if (a > b) { a = b; } return old_a; } /** galois::atomicAdd **/ template const Ty atomicAdd(std::atomic& val, Ty delta) { Ty old_val = val.load(std::memory_order_relaxed); while (!val.compare_exchange_weak(old_val, old_val + delta, std::memory_order_relaxed)) ; return old_val; } template const Ty add(std::atomic& a, const Ty& b) { Ty old_a = a.load(std::memory_order_relaxed); a.store(a + b, std::memory_order_relaxed); return old_a; } template const Ty add(Ty& a, std::atomic& b) { Ty 
old_a = a; a = a + b.load(std::memory_order_relaxed); return old_a; } template const Ty add(Ty& a, const Ty& b) { Ty old_a = a; a += b; return old_a; } /** * atomic subtraction of delta (because atomicAdd with negative numbers implies * a signed integer cast) */ template const Ty atomicSubtract(std::atomic& val, Ty delta) { Ty old_val = val.load(std::memory_order_relaxed); while (!val.compare_exchange_weak(old_val, old_val - delta, std::memory_order_relaxed)) ; return old_val; } template const Ty set(Ty& a, const Ty& b) { a = b; return a; } template const Ty set(std::atomic& a, const Ty& b) { a.store(b, std::memory_order_relaxed); return a; } /** Pair Wise Average function **/ template const Ty pairWiseAvg(Ty a, Ty b) { return (a + b) / 2.0; } template void pairWiseAvg_vec(std::vector& a_vec, std::vector& b_vec) { for (unsigned i = 0; i < a_vec.size(); ++i) { a_vec[i] = (a_vec[i] + b_vec[i]) / 2.0; } } template void resetVec(Ty& a_arr) { // std::for_each(a_arr.begin(), a_arr.end(),[](Ty &ele){ele = 0;} ); std::fill(a_arr.begin(), a_arr.end(), 0); } template void pairWiseAvg_vec(Ty& a_arr, Ty& b_arr) { for (unsigned i = 0; i < a_arr.size(); ++i) { a_arr[i] = (a_arr[i] + b_arr[i]) / 2.0; } } template void addArray(Ty& a_arr, Ty& b_arr) { for (unsigned i = 0; i < a_arr.size(); ++i) { a_arr[i] = (a_arr[i] + b_arr[i]); } } template void resetVec(std::vector& a_vec) { std::for_each(a_vec.begin(), a_vec.end(), [](Ty& ele) { ele = 0; }); } // like std::inner_product template Ty innerProduct(ItrTy a_begin, ItrTy a_end, ItrTy b_begin, Ty init_value) { auto jj = b_begin; for (auto ii = a_begin; ii != a_end; ++ii, ++jj) { init_value += (*ii) * (*jj); } return init_value; } // like std::inner_product template Ty innerProduct(ItrTy& a_arr, ItrTy& b_arr, Ty init_value) { auto jj = b_arr.begin(); for (auto ii = a_arr.begin(); ii != a_arr.end(); ++ii, ++jj) { init_value += (*ii) * (*jj); } return init_value; } template void reset(Ty& var, Ty val) { var = val; } template void 
reset(std::atomic& var, Ty val) { var.store(val, std::memory_order_relaxed); } } // end namespace galois ================================================ FILE: libgalois/include/galois/AtomicWrapper.h ================================================ /* * This file belongs to the Galois project, a C++ library for exploiting * parallelism. The code is being released under the terms of the 3-Clause BSD * License (a copy is located in LICENSE.txt at the top-level directory). * * Copyright (C) 2018, The University of Texas at Austin. All rights reserved. * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY, * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF * DEALING OR USAGE OF TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances * shall University be liable for incidental, special, indirect, direct or * consequential damages or loss of profits, interruption of business, or * related expenses which may arise from use of Software or Documentation, * including but not limited to those resulting from defects in Software and/or * Documentation, or loss or inaccuracy of data of any kind. */ /** * @file AtomicWrapper.h * * Contains a copyable atomics class. */ #ifndef _ATOMIC_WRAPPER_H_ #define _ATOMIC_WRAPPER_H_ #include #include "galois/config.h" namespace galois { /** * Class that inherits from std::atomic to make it copyable by defining a copy * constructor. * * @tparam T type of the atomic */ template class CopyableAtomic : public std::atomic { public: //! Default constructor CopyableAtomic() : std::atomic(T{}) {} //! Constructor initializing atomic to passed in data constexpr CopyableAtomic(T desired) : std::atomic(desired) {} //! 
Copy constructor constexpr CopyableAtomic(const CopyableAtomic& other) : CopyableAtomic(other.load(std::memory_order_relaxed)) {} //! Copy constructor operator CopyableAtomic& operator=(const CopyableAtomic& other) { this->store(other.load(std::memory_order_relaxed), std::memory_order_relaxed); return *this; } }; } // namespace galois #endif ================================================ FILE: libgalois/include/galois/Bag.h ================================================ /* * This file belongs to the Galois project, a C++ library for exploiting * parallelism. The code is being released under the terms of the 3-Clause BSD * License (a copy is located in LICENSE.txt at the top-level directory). * * Copyright (C) 2018, The University of Texas at Austin. All rights reserved. * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY, * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF * DEALING OR USAGE OF TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances * shall University be liable for incidental, special, indirect, direct or * consequential damages or loss of profits, interruption of business, or * related expenses which may arise from use of Software or Documentation, * including but not limited to those resulting from defects in Software and/or * Documentation, or loss or inaccuracy of data of any kind. */ #ifndef GALOIS_BAG_H #define GALOIS_BAG_H #include #include #include #include "galois/config.h" #include "galois/gstl.h" #include "galois/runtime/Executor_OnEach.h" #include "galois/substrate/PerThreadStorage.h" #include "galois/gIO.h" #include "galois/runtime/Mem.h" namespace galois { /** * Unordered collection of elements. 
This data structure supports scalable * concurrent pushes but reading the bag can only be done serially. */ template class InsertBag { struct header { header* next; T* dbegin; // start of interesting data T* dend; // end of valid data T* dlast; // end of storage }; typedef std::pair PerThread; public: template class Iterator : public boost::iterator_facade, U, boost::forward_traversal_tag> { friend class boost::iterator_core_access; galois::substrate::PerThreadStorage>* hd; unsigned int thr; header* p; U* v; bool init_thread() { p = thr < hd->size() ? hd->getRemote(thr)->first : 0; v = p ? p->dbegin : 0; return p; } bool advance_local() { if (p) { ++v; return v != p->dend; } return false; } bool advance_chunk() { if (p) { p = p->next; v = p ? p->dbegin : 0; } return p; } void advance_thread() { while (thr < hd->size()) { ++thr; if (init_thread()) return; } } void increment() { if (advance_local()) return; if (advance_chunk()) return; advance_thread(); } template bool equal(const Iterator& o) const { return hd == o.hd && thr == o.thr && p == o.p && v == o.v; } U& dereference() const { return *v; } public: Iterator() : hd(0), thr(0), p(0), v(0) {} template Iterator(const Iterator& o) : hd(o.hd), thr(o.thr), p(o.p), v(o.v) {} Iterator( galois::substrate::PerThreadStorage>* h, unsigned t) : hd(h), thr(t), p(0), v(0) { // find first valid item if (!init_thread()) advance_thread(); } }; private: galois::runtime::FixedSizeHeap heap; galois::substrate::PerThreadStorage heads; void insHeader(header* h) { PerThread& hpair = *heads.getLocal(); if (hpair.second) { hpair.second->next = h; hpair.second = h; } else { hpair.first = hpair.second = h; } } header* newHeaderFromHeap(void* m, unsigned size) { header* H = new (m) header(); int offset = 1; if (sizeof(T) < sizeof(header)) offset += sizeof(header) / sizeof(T); T* a = reinterpret_cast(m); H->dbegin = &a[offset]; H->dend = H->dbegin; H->dlast = &a[(size / sizeof(T))]; H->next = 0; return H; } header* newHeader() { if 
(BlockSize) { return newHeaderFromHeap(heap.allocate(BlockSize), BlockSize); } else { return newHeaderFromHeap(galois::runtime::pagePoolAlloc(), galois::runtime::pagePoolSize()); } } void destruct_serial() { for (unsigned x = 0; x < heads.size(); ++x) { PerThread& hpair = *heads.getRemote(x); header*& h = hpair.first; while (h) { uninitialized_destroy(h->dbegin, h->dend); header* h2 = h; h = h->next; if (BlockSize) heap.deallocate(h2); else galois::runtime::pagePoolFree(h2); } hpair.second = 0; } } void destruct_parallel(void) { galois::runtime::on_each_gen( [this](const unsigned int tid, const unsigned int) { PerThread& hpair = *heads.getLocal(tid); header*& h = hpair.first; while (h) { uninitialized_destroy(h->dbegin, h->dend); header* h2 = h; h = h->next; if (BlockSize) heap.deallocate(h2); else galois::runtime::pagePoolFree(h2); } hpair.second = 0; }, std::make_tuple(galois::no_stats())); } public: // static_assert(BlockSize == 0 || BlockSize >= (2 * sizeof(T) + // sizeof(header)), // "BlockSize should larger than sizeof(T) + O(1)"); InsertBag() : heap(BlockSize) {} InsertBag(InsertBag&& o) : heap(BlockSize) { std::swap(heap, o.heap); std::swap(heads, o.heads); } InsertBag& operator=(InsertBag&& o) { std::swap(heap, o.heap); std::swap(heads, o.heads); return *this; } InsertBag(const InsertBag&) = delete; InsertBag& operator=(const InsertBag&) = delete; ~InsertBag() { destruct_parallel(); } void clear() { destruct_parallel(); } void clear_serial() { destruct_serial(); } void swap(InsertBag& o) { std::swap(heap, o.heap); std::swap(heads, o.heads); } typedef T value_type; typedef T* pointer; typedef const T* const_pointer; typedef const T& const_reference; typedef T& reference; typedef Iterator iterator; typedef Iterator const_iterator; typedef iterator local_iterator; iterator begin() { return iterator(&heads, 0); } iterator end() { return iterator(&heads, heads.size()); } const_iterator begin() const { return const_iterator(&heads, 0); } const_iterator end() 
const { return const_iterator(&heads, heads.size()); } local_iterator local_begin() { return local_iterator(&heads, galois::substrate::ThreadPool::getTID()); } local_iterator local_end() { return local_iterator(&heads, galois::substrate::ThreadPool::getTID() + 1); } bool empty() const { for (unsigned x = 0; x < heads.size(); ++x) { header* h = heads.getRemote(x)->first; if (h) return false; } return true; } //! Thread safe bag insertion template reference emplace(Args&&... args) { header* H = heads.getLocal()->second; T* rv; if (!H || H->dend == H->dlast) { H = newHeader(); insHeader(H); } rv = new (H->dend) T(std::forward(args)...); ++H->dend; return *rv; } template reference emplace_back(Args&&... args) { return emplace(std::forward(args)...); } /** * Pop the last element pushed by this thread. The number of consecutive * pops supported without intevening pushes is implementation dependent. */ void pop() { header* H = heads.getLocal()->second; if (H->dbegin == H->dend) { throw std::out_of_range("InsertBag::pop"); } uninitialized_destroy(H->dend - 1, H->dend); --H->dend; } //! Thread safe bag insertion template reference push(ItemTy&& val) { return emplace(std::forward(val)); } //! Thread safe bag insertion template reference push_back(ItemTy&& val) { return emplace(std::forward(val)); } }; } // namespace galois #endif ================================================ FILE: libgalois/include/galois/CheckedObject.h ================================================ /* * This file belongs to the Galois project, a C++ library for exploiting * parallelism. The code is being released under the terms of the 3-Clause BSD * License (a copy is located in LICENSE.txt at the top-level directory). * * Copyright (C) 2018, The University of Texas at Austin. All rights reserved. 
* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY, * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF * DEALING OR USAGE OF TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances * shall University be liable for incidental, special, indirect, direct or * consequential damages or loss of profits, interruption of business, or * related expenses which may arise from use of Software or Documentation, * including but not limited to those resulting from defects in Software and/or * Documentation, or loss or inaccuracy of data of any kind. */ #ifndef GALOIS_CHECKEDOBJECT_H #define GALOIS_CHECKEDOBJECT_H #include "galois/config.h" #include "galois/runtime/Context.h" namespace galois { /** * Conflict-checking wrapper for any type. Performs global conflict detection * on the enclosed object. This enables arbitrary types to be managed by the * Galois runtime. */ template class GChecked : public galois::runtime::Lockable { T val; public: template GChecked(Args&&... args) : val(std::forward(args)...) {} T& get(galois::MethodFlag m = MethodFlag::WRITE) { galois::runtime::acquire(this, m); return val; } const T& get(galois::MethodFlag m = MethodFlag::WRITE) const { galois::runtime::acquire(const_cast(this), m); return val; } }; template <> class GChecked : public galois::runtime::Lockable { public: void get(galois::MethodFlag m = MethodFlag::WRITE) const { galois::runtime::acquire(const_cast(this), m); } }; } // namespace galois #endif // _GALOIS_CHECKEDOBJECT_H ================================================ FILE: libgalois/include/galois/CopyableTuple.h ================================================ /* * This file belongs to the Galois project, a C++ library for exploiting * parallelism. 
The code is being released under the terms of the 3-Clause BSD * License (a copy is located in LICENSE.txt at the top-level directory). * * Copyright (C) 2018, The University of Texas at Austin. All rights reserved. * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY, * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF * DEALING OR USAGE OF TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances * shall University be liable for incidental, special, indirect, direct or * consequential damages or loss of profits, interruption of business, or * related expenses which may arise from use of Software or Documentation, * including but not limited to those resulting from defects in Software and/or * Documentation, or loss or inaccuracy of data of any kind. */ /** * @file CopyableTuple.h * * Contains copyable tuple classes whose elements are contiguous in memory */ #pragma once #include "galois/config.h" namespace galois { /** * Struct that contains 2 elements. Used over std::pair as std::pair memory * layout isn't guaranteed. * * @tparam T1 type of first element * @tparam T2 type of second element */ template struct Pair { //! first element T1 first; //! second element T2 second; //! empty constructor Pair() {} //! Constructor that initializes 2 fields Pair(T1 one, T2 two) { first = one; second = two; } }; /** * Struct that contains 3 elements. Used over std::tuple as std::tuple memory * layout isn't guaranteed. * * @tparam T1 type of first element * @tparam T2 type of second element * @tparam T3 type of third element */ template struct TupleOfThree { //! first element T1 first; //! second element T2 second; //! third element T3 third; //! empty constructor TupleOfThree() {} //! 
Constructor that initializes 3 fields TupleOfThree(T1 one, T2 two, T3 three) { first = one; second = two; third = three; } }; } // namespace galois ================================================ FILE: libgalois/include/galois/DynamicBitset.h ================================================ /* * This file belongs to the Galois project, a C++ library for exploiting * parallelism. The code is being released under the terms of the 3-Clause BSD * License (a copy is located in LICENSE.txt at the top-level directory). * * Copyright (C) 2019, The University of Texas at Austin. All rights reserved. * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY, * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF * DEALING OR USAGE OF TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances * shall University be liable for incidental, special, indirect, direct or * consequential damages or loss of profits, interruption of business, or * related expenses which may arise from use of Software or Documentation, * including but not limited to those resulting from defects in Software and/or * Documentation, or loss or inaccuracy of data of any kind. */ /** * @file galois/DynamicBitset.h * * Contains the DynamicBitSet class and most of its implementation. 
*/ #ifndef _GALOIS_DYNAMIC_BIT_SET_ #define _GALOIS_DYNAMIC_BIT_SET_ #include #include #include #include #include #include "galois/config.h" #include "galois/AtomicWrapper.h" #include "galois/PODResizeableArray.h" #include "galois/GaloisForwardDecl.h" #include "galois/Traits.h" #include "galois/Galois.h" namespace galois { /** * Concurrent dynamically allocated bitset **/ class DynamicBitSet { protected: galois::PODResizeableArray> bitvec; size_t num_bits; static constexpr uint32_t bits_uint64 = sizeof(uint64_t) * CHAR_BIT; public: //! Constructor which initializes to an empty bitset. DynamicBitSet() : num_bits(0) {} /** * Returns the underlying bitset representation to the user * * @returns constant reference vector of copyable atomics that represents * the bitset */ const auto& get_vec() const { return bitvec; } /** * Returns the underlying bitset representation to the user * * @returns reference to vector of copyable atomics that represents the * bitset */ auto& get_vec() { return bitvec; } /** * Resizes the bitset. * * @param n Size to change the bitset to */ void resize(uint64_t n) { assert(bits_uint64 == 64); // compatibility with other devices num_bits = n; bitvec.resize((n + bits_uint64 - 1) / bits_uint64); reset(); } /** * Reserves capacity for the bitset. * * @param n Size to reserve the capacity of the bitset to */ void reserve(uint64_t n) { assert(bits_uint64 == 64); // compatibility with other devices bitvec.reserve((n + bits_uint64 - 1) / bits_uint64); } /** * Gets the size of the bitset * @returns The number of bits held by the bitset */ size_t size() const { return num_bits; } /** * Gets the space taken by the bitset * @returns the space in bytes taken by this bitset */ // size_t alloc_size() const { return bitvec.size() * sizeof(uint64_t); } /** * Unset every bit in the bitset. 
*/ void reset() { std::fill(bitvec.begin(), bitvec.end(), 0); } /** * Unset a range of bits given an inclusive range * * @param begin first bit in range to reset * @param end last bit in range to reset */ void reset(size_t begin, size_t end) { if (num_bits == 0) return; assert(begin <= (num_bits - 1)); assert(end <= (num_bits - 1)); // 100% safe implementation, but slow // for (unsigned long i = begin; i <= end; i++) { // size_t bit_index = i / bits_uint64; // uint64_t bit_offset = 1; // bit_offset <<= (i % bits_uint64); // uint64_t mask = ~bit_offset; // bitvec[bit_index] &= mask; //} // block which you are safe to clear size_t vec_begin = (begin + bits_uint64 - 1) / bits_uint64; size_t vec_end; if (end == (num_bits - 1)) vec_end = bitvec.size(); else vec_end = (end + 1) / bits_uint64; // floor if (vec_begin < vec_end) { std::fill(bitvec.begin() + vec_begin, bitvec.begin() + vec_end, 0); } vec_begin *= bits_uint64; vec_end *= bits_uint64; // at this point vec_begin -> vec_end-1 has been reset if (vec_begin > vec_end) { // no fill happened if (begin < vec_begin) { size_t diff = vec_begin - begin; assert(diff < 64); uint64_t mask = ((uint64_t)1 << (64 - diff)) - 1; size_t end_diff = end - vec_end + 1; uint64_t or_mask = ((uint64_t)1 << end_diff) - 1; mask |= ~or_mask; size_t bit_index = begin / bits_uint64; bitvec[bit_index] &= mask; } } else { if (begin < vec_begin) { size_t diff = vec_begin - begin; assert(diff < 64); uint64_t mask = ((uint64_t)1 << (64 - diff)) - 1; size_t bit_index = begin / bits_uint64; bitvec[bit_index] &= mask; } if (end >= vec_end) { size_t diff = end - vec_end + 1; assert(diff < 64); uint64_t mask = ((uint64_t)1 << diff) - 1; size_t bit_index = end / bits_uint64; bitvec[bit_index] &= ~mask; } } } /** * Check a bit to see if it is currently set. 
* Using this is recommeneded only if set() and reset() * are not being used in that parallel section/phase * * @param index Bit to check to see if set * @returns true if index is set */ bool test(size_t index) const { size_t bit_index = index / bits_uint64; uint64_t bit_offset = 1; bit_offset <<= (index % bits_uint64); return ((bitvec[bit_index].load(std::memory_order_relaxed) & bit_offset) != 0); } /** * Set a bit in the bitset. * * @param index Bit to set * @returns the old value */ bool set(size_t index) { size_t bit_index = index / bits_uint64; uint64_t bit_offset = 1; bit_offset <<= (index % bits_uint64); uint64_t old_val = bitvec[bit_index]; // test and set // if old_bit is 0, then atomically set it while (((old_val & bit_offset) == 0) && !bitvec[bit_index].compare_exchange_weak( old_val, old_val | bit_offset, std::memory_order_relaxed)) ; return (old_val & bit_offset); } /** * Reset a bit in the bitset. * * @param index Bit to reset * @returns the old value */ bool reset(size_t index) { size_t bit_index = index / bits_uint64; uint64_t bit_offset = 1; bit_offset <<= (index % bits_uint64); uint64_t old_val = bitvec[bit_index]; // test and reset // if old_bit is 1, then atomically reset it while (((old_val & bit_offset) != 0) && !bitvec[bit_index].compare_exchange_weak( old_val, old_val & ~bit_offset, std::memory_order_relaxed)) ; return (old_val & bit_offset); } // assumes bit_vector is not updated (set) in parallel void bitwise_or(const DynamicBitSet& other) { assert(size() == other.size()); auto& other_bitvec = other.get_vec(); galois::do_all( galois::iterate(size_t{0}, bitvec.size()), [&](size_t i) { bitvec[i] |= other_bitvec[i]; }, galois::no_stats()); } // assumes bit_vector is not updated (set) in parallel /** * Does an IN-PLACE bitwise and of this bitset and another bitset * * @param other Other bitset to do bitwise and with */ void bitwise_and(const DynamicBitSet& other) { assert(size() == other.size()); auto& other_bitvec = other.get_vec(); 
galois::do_all( galois::iterate(size_t{0}, bitvec.size()), [&](size_t i) { bitvec[i] &= other_bitvec[i]; }, galois::no_stats()); } /** * Does an IN-PLACE bitwise and of 2 passed in bitsets and saves to this * bitset * * @param other1 Bitset to and with other 2 * @param other2 Bitset to and with other 1 */ void bitwise_and(const DynamicBitSet& other1, const DynamicBitSet& other2) { assert(size() == other1.size()); assert(size() == other2.size()); auto& other_bitvec1 = other1.get_vec(); auto& other_bitvec2 = other2.get_vec(); galois::do_all( galois::iterate(size_t{0}, bitvec.size()), [&](size_t i) { bitvec[i] = other_bitvec1[i] & other_bitvec2[i]; }, galois::no_stats()); } /** * Does an IN-PLACE bitwise xor of this bitset and another bitset * * @param other Other bitset to do bitwise xor with */ void bitwise_xor(const DynamicBitSet& other) { assert(size() == other.size()); auto& other_bitvec = other.get_vec(); galois::do_all( galois::iterate(size_t{0}, bitvec.size()), [&](size_t i) { bitvec[i] ^= other_bitvec[i]; }, galois::no_stats()); } /** * Does an IN-PLACE bitwise and of 2 passed in bitsets and saves to this * bitset * * @param other1 Bitset to xor with other 2 * @param other2 Bitset to xor with other 1 */ void bitwise_xor(const DynamicBitSet& other1, const DynamicBitSet& other2) { assert(size() == other1.size()); assert(size() == other2.size()); auto& other_bitvec1 = other1.get_vec(); auto& other_bitvec2 = other2.get_vec(); galois::do_all( galois::iterate(size_t{0}, bitvec.size()), [&](size_t i) { bitvec[i] = other_bitvec1[i] ^ other_bitvec2[i]; }, galois::no_stats()); } /** * Count how many bits are set in the bitset * * @returns number of set bits in the bitset */ uint64_t count() const { galois::GAccumulator ret; galois::do_all( galois::iterate(bitvec.begin(), bitvec.end()), [&](uint64_t n) { #ifdef __GNUC__ ret += __builtin_popcountll(n); #else n = n - ((n >> 1) & 0x5555555555555555UL); n = (n & 0x3333333333333333UL) + ((n >> 2) & 0x3333333333333333UL); ret 
+= (((n + (n >> 4)) & 0xF0F0F0F0F0F0F0FUL) * 0x101010101010101UL) >> 56; #endif }, galois::no_stats()); return ret.reduce(); } /** * Returns a vector containing the set bits in this bitset in order * from left to right. * Do NOT call in a parallel region as it uses galois::on_each. * * @returns vector with offsets into set bits */ // TODO uint32_t is somewhat dangerous; change in the future std::vector getOffsets() const { uint32_t activeThreads = galois::getActiveThreads(); std::vector tPrefixBitCounts(activeThreads); // count how many bits are set on each thread galois::on_each([&](unsigned tid, unsigned nthreads) { size_t start; size_t end; std::tie(start, end) = galois::block_range((size_t)0, this->size(), tid, nthreads); unsigned int count = 0; for (unsigned int i = start; i < end; ++i) { if (this->test(i)) ++count; } tPrefixBitCounts[tid] = count; }); // calculate prefix sum of bits per thread for (unsigned int i = 1; i < activeThreads; ++i) { tPrefixBitCounts[i] += tPrefixBitCounts[i - 1]; } // total num of set bits uint64_t bitsetCount = tPrefixBitCounts[activeThreads - 1]; std::vector offsets; // calculate the indices of the set bits and save them to the offset // vector if (bitsetCount > 0) { offsets.resize(bitsetCount); galois::on_each([&](unsigned tid, unsigned nthreads) { size_t start; size_t end; std::tie(start, end) = galois::block_range((size_t)0, this->size(), tid, nthreads); unsigned int count = 0; unsigned int tPrefixBitCount; if (tid == 0) { tPrefixBitCount = 0; } else { tPrefixBitCount = tPrefixBitCounts[tid - 1]; } for (unsigned int i = start; i < end; ++i) { if (this->test(i)) { offsets[tPrefixBitCount + count] = i; ++count; } } }); } return offsets; } //! this is defined to using tt_is_copyable = int; }; //! An empty bitset object; used mainly by InvalidBitsetFnTy static galois::DynamicBitSet EmptyBitset; //! A structure representing an empty bitset. struct InvalidBitsetFnTy { //! 
Returns false as this is an empty bitset static constexpr bool is_vector_bitset() { return false; } //! Returns false as this is an empty bitset (invalid) static constexpr bool is_valid() { return false; } //! Returns the empty bitset static galois::DynamicBitSet& get() { return EmptyBitset; } //! No-op since it's an empty bitset static void reset_range(size_t, size_t) {} }; } // namespace galois #endif ================================================ FILE: libgalois/include/galois/Endian.h ================================================ /* * This file belongs to the Galois project, a C++ library for exploiting * parallelism. The code is being released under the terms of the 3-Clause BSD * License (a copy is located in LICENSE.txt at the top-level directory). * * Copyright (C) 2018, The University of Texas at Austin. All rights reserved. * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY, * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF * DEALING OR USAGE OF TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances * shall University be liable for incidental, special, indirect, direct or * consequential damages or loss of profits, interruption of business, or * related expenses which may arise from use of Software or Documentation, * including but not limited to those resulting from defects in Software and/or * Documentation, or loss or inaccuracy of data of any kind. 
*/ #ifndef GALOIS_ENDIAN_H #define GALOIS_ENDIAN_H #include #include "galois/config.h" namespace galois { static inline uint32_t bswap32(uint32_t x) { #if defined(__GNUC__) || defined(__clang__) return __builtin_bswap32(x); #else return ((x << 24) & 0xff000000) | ((x << 8) & 0x00ff0000) | ((x >> 8) & 0x0000ff00) | ((x >> 24) & 0x000000ff); #endif } static inline uint64_t bswap64(uint64_t x) { #if defined(__GNUC__) || defined(__clang__) return __builtin_bswap64(x); #else return ((x << 56) & 0xff00000000000000UL) | ((x << 40) & 0x00ff000000000000UL) | ((x << 24) & 0x0000ff0000000000UL) | ((x << 8) & 0x000000ff00000000UL) | ((x >> 8) & 0x00000000ff000000UL) | ((x >> 24) & 0x0000000000ff0000UL) | ((x >> 40) & 0x000000000000ff00UL) | ((x >> 56) & 0x00000000000000ffUL); #endif } // NB: Wrap these standard functions with different names because // sometimes le64toh and such are implemented as macros and we don't // want any nasty surprises. static inline uint64_t convert_le64toh(uint64_t x) { #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ return x; #else return bswap64(x); #endif } static inline uint32_t convert_le32toh(uint32_t x) { #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ return x; #else return bswap32(x); #endif } static inline uint64_t convert_htobe64(uint64_t x) { #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ return x; #else return bswap64(x); #endif } static inline uint32_t convert_htobe32(uint32_t x) { #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ return x; #else return bswap32(x); #endif } static inline uint64_t convert_htole64(uint64_t x) { #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ return x; #else return bswap64(x); #endif } static inline uint32_t convert_htole32(uint32_t x) { #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ return x; #else return bswap32(x); #endif } } // namespace galois #endif ================================================ FILE: libgalois/include/galois/FixedSizeRing.h ================================================ /* * This file belongs to 
the Galois project, a C++ library for exploiting * parallelism. The code is being released under the terms of the 3-Clause BSD * License (a copy is located in LICENSE.txt at the top-level directory). * * Copyright (C) 2018, The University of Texas at Austin. All rights reserved. * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY, * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF * DEALING OR USAGE OF TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances * shall University be liable for incidental, special, indirect, direct or * consequential damages or loss of profits, interruption of business, or * related expenses which may arise from use of Software or Documentation, * including but not limited to those resulting from defects in Software and/or * Documentation, or loss or inaccuracy of data of any kind. */ #ifndef GALOIS_FIXEDSIZERING_H #define GALOIS_FIXEDSIZERING_H #include #include #include #include #include #include "galois/config.h" #include "galois/optional.h" #include "galois/LazyArray.h" namespace galois { //! 
Unordered collection of bounded size template class FixedSizeBagBase { LazyArray datac; typedef typename boost::mpl::if_c, unsigned>::type Count; Count count; T* at(unsigned i) { return &datac[i]; } const T* at(unsigned i) const { return &datac[i]; } bool precondition() const { return count <= ChunkSize; } public: typedef T value_type; typedef T* pointer; typedef const T* const_pointer; typedef T& reference; typedef const T& const_reference; typedef boost::reverse_iterator iterator; typedef boost::reverse_iterator const_iterator; typedef pointer reverse_iterator; typedef const_pointer const_reverse_iterator; FixedSizeBagBase() : count(0) {} template FixedSizeBagBase(InputIterator first, InputIterator last) : count(0) { while (first != last) { assert(count < ChunkSize); datac.emplace(count++, *first++); } } FixedSizeBagBase(const FixedSizeBagBase& o) = delete; FixedSizeBagBase& operator=(const FixedSizeBagBase& o) = delete; ~FixedSizeBagBase() { clear(); } unsigned size() const { assert(precondition()); return count; } bool empty() const { assert(precondition()); return count == 0; } bool full() const { assert(precondition()); return count == ChunkSize; } void clear() { assert(precondition()); for (unsigned x = 0; x < count; ++x) datac.destroy(x); count = 0; } template pointer push_back(U&& val) { return push_front(std::forward(val)); } template pointer emplace_back(Args&&... 
args) { return emplace_front(std::forward(args)...); } template auto push_front(U&& val) -> typename std::enable_if::type { return emplace_front(std::forward(val)); } template auto push_front(const value_type& val) -> typename std::enable_if::type { unsigned top; do { top = count.load(std::memory_order_relaxed); if (top >= ChunkSize) return nullptr; } while (!count.compare_exchange_weak(top, top + 1)); return datac.emplace(top, val); } /** * emplace_front is not available for concurrent versions because it is not * possible for clients to know in advance whether insertion will succeed, * which will leave xvalue arguments in indeterminate state. */ template auto emplace_front(Args&&... args) -> typename std::enable_if::type { if (full()) return 0; unsigned top = count++; return datac.emplace(top, std::forward(args)...); } reference back() { return front(); } const_reference back() const { return front(); } galois::optional extract_back() { return extract_front(); } bool pop_back() { return pop_front(); } reference front() { assert(precondition()); assert(!empty()); return *at(count - 1); } const_reference front() const { return *at(count - 1); } template auto extract_front() -> typename std::enable_if>::type { if (!empty()) { galois::optional retval(back()); pop_back(); return retval; } return galois::optional(); } //! returns true if something was popped template auto pop_front() -> typename std::enable_if::type { unsigned top; do { top = count.load(std::memory_order_relaxed); if (top == 0) return false; } while (!count.compare_exchange_weak(top, top - 1)); datac.destroy(top); return true; } //! 
returns true if something was popped template auto pop_front() -> typename std::enable_if::type { if (count == 0) return false; datac.destroy(--count); return true; } reverse_iterator rbegin() { return &datac[0]; } reverse_iterator rend() { return &datac[count]; } const_reverse_iterator rbegin() const { return &datac[0]; } const_reverse_iterator rend() const { return &datac[count]; } iterator begin() { return iterator(rend()); } iterator end() { return iterator(rbegin()); } const_iterator begin() const { return const_iterator(rend()); } const_iterator end() const { return const_iterator(rbegin()); } }; //! Unordered collection of bounded size template using FixedSizeBag = FixedSizeBagBase; //! Unordered collection of bounded size with concurrent insertion or deletion //! but not both simultaneously template using ConcurrentFixedSizeBag = FixedSizeBagBase; //! Ordered collection of bounded size template class FixedSizeRing { LazyArray datac; unsigned start; unsigned count; T* at(unsigned i) { return &datac[i]; } const T* at(unsigned i) const { return &datac[i]; } bool precondition() const { return count <= ChunkSize && start <= ChunkSize; } template class Iterator : public boost::iterator_facade, U, boost::random_access_traversal_tag> { friend class boost::iterator_core_access; U* base; unsigned cur; unsigned count; template bool equal(const Iterator& o) const { assert(base && o.base); return &base[cur] == &o.base[o.cur] && count == o.count; } U& dereference() const { return base[cur]; } void increment() { assert(base && count != 0); count -= 1; cur = (cur + 1) % ChunkSize; } void decrement() { assert(base && count < ChunkSize); count += 1; cur = (cur + ChunkSize - 1) % ChunkSize; } void advance(ptrdiff_t x) { count -= x; cur = (cur + ChunkSize + x) % ChunkSize; } ptrdiff_t distance_to(const Iterator& o) const { ptrdiff_t c = count; ptrdiff_t oc = o.count; return c - oc; } public: Iterator() : base(0), cur(0), count(0) {} template Iterator(const Iterator& o) : 
base(o.base), cur(o.cur), count(o.count) {} Iterator(U* b, unsigned c, unsigned co) : base(b), cur(c), count(co) {} }; public: typedef T value_type; typedef T* pointer; typedef T& reference; typedef const T& const_reference; typedef Iterator iterator; typedef Iterator const_iterator; typedef boost::reverse_iterator> reverse_iterator; typedef boost::reverse_iterator> const_reverse_iterator; FixedSizeRing() : start(0), count(0) {} template FixedSizeRing(InputIterator first, InputIterator last) : start(0), count(0) { while (first != last) { assert(count < ChunkSize); datac.emplace(count++, *first++); } } FixedSizeRing(const FixedSizeRing& o) = delete; FixedSizeRing& operator=(const FixedSizeRing& o) = delete; ~FixedSizeRing() { clear(); } unsigned size() const { assert(precondition()); return count; } bool empty() const { assert(precondition()); return count == 0; } bool full() const { assert(precondition()); return count == ChunkSize; } reference getAt(unsigned x) { assert(precondition()); assert(!empty()); return *at((start + x) % ChunkSize); } const_reference getAt(unsigned x) const { assert(precondition()); assert(!empty()); return *at((start + x) % ChunkSize); } void clear() { assert(precondition()); for (unsigned x = 0; x < count; ++x) datac.destroy((start + x) % ChunkSize); count = 0; start = 0; } // NB(ddn): Keeping emplace_front/_back code paths separate to improve // branch prediction etc template pointer emplace(iterator pos, Args&&... args) { if (full()) return 0; unsigned i; if (pos == begin()) { i = start = (start + ChunkSize - 1) % ChunkSize; ++count; } else if (pos == end()) { i = (start + count) % ChunkSize; ++count; } else { auto d = std::distance(begin(), pos); i = (start + d) % ChunkSize; emplace_back(); std::move_backward(begin() + d, end() - 1, end()); datac.destroy(i); } return datac.emplace(i, std::forward(args)...); } template pointer push_front(U&& val) { return emplace_front(std::forward(val)); } template pointer emplace_front(Args&&... 
args) { if (full()) return 0; start = (start + ChunkSize - 1) % ChunkSize; ++count; return datac.emplace(start, std::forward(args)...); } template pointer push_back(U&& val) { return emplace_back(std::forward(val)); } template pointer emplace_back(Args&&... args) { if (full()) return 0; unsigned end = (start + count) % ChunkSize; ++count; return datac.emplace(end, std::forward(args)...); } reference front() { assert(precondition()); assert(!empty()); return *at(start); } const_reference front() const { assert(precondition()); assert(!empty()); return *at(start); } galois::optional extract_front() { if (!empty()) { galois::optional retval(front()); pop_front(); return retval; } return galois::optional(); } void pop_front() { assert(precondition()); assert(!empty()); datac.destroy(start); start = (start + 1) % ChunkSize; --count; } reference back() { assert(precondition()); assert(!empty()); return *at((start + count - 1) % ChunkSize); } const_reference back() const { assert(precondition()); assert(!empty()); return *at((start + count - 1) % ChunkSize); } galois::optional extract_back() { if (!empty()) { galois::optional retval(back()); pop_back(); return retval; } return galois::optional(); } void pop_back() { assert(precondition()); assert(!empty()); datac.destroy((start + count - 1) % ChunkSize); --count; } iterator begin() { return iterator(at(0), start, count); } iterator end() { return iterator(at(0), (start + count) % ChunkSize, 0); } const_iterator begin() const { return const_iterator(at(0), start, count); } const_iterator end() const { return const_iterator(at(0), (start + count) % ChunkSize, 0); } reverse_iterator rbegin() { return reverse_iterator(end()); } reverse_iterator rend() { return reverse_iterator(begin()); } const_iterator rbegin() const { const_reverse_iterator(this->end()); } const_iterator rend() const { const_reverse_iterator(this->begin()); } }; } // namespace galois #endif ================================================ FILE: 
libgalois/include/galois/FlatMap.h ================================================ /* * This file belongs to the Galois project, a C++ library for exploiting * parallelism. The code is being released under the terms of the 3-Clause BSD * License (a copy is located in LICENSE.txt at the top-level directory). * * Copyright (C) 2018, The University of Texas at Austin. All rights reserved. * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY, * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF * DEALING OR USAGE OF TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances * shall University be liable for incidental, special, indirect, direct or * consequential damages or loss of profits, interruption of business, or * related expenses which may arise from use of Software or Documentation, * including but not limited to those resulting from defects in Software and/or * Documentation, or loss or inaccuracy of data of any kind. */ #ifndef GALOIS_FLATMAP_H #define GALOIS_FLATMAP_H #include #include #include #include #include "galois/config.h" namespace galois { //! Simple map data structure, based off a single array. template , class _Alloc = std::allocator>, class _Store = std::vector, _Alloc>> class flat_map { public: typedef _Key key_type; typedef _Tp mapped_type; typedef std::pair<_Key, _Tp> value_type; typedef _Compare key_compare; typedef _Alloc allocator_type; class value_compare { friend class flat_map<_Key, _Tp, _Compare, _Alloc, _Store>; protected: _Compare comp; value_compare(_Compare __c) : comp(__c) {} public: bool operator()(const value_type& __x, const value_type& __y) const { return comp(__x.first, __y.first); } }; private: /// This turns... 
typedef typename _Alloc::template rebind::other _Pair_alloc_type; typedef _Store _VectTy; _VectTy _data; _Compare _comp; class value_key_compare { friend class flat_map<_Key, _Tp, _Compare, _Alloc, _Store>; protected: _Compare comp; value_key_compare(_Compare __c) : comp(__c) {} public: bool operator()(const value_type& __x, const key_type& __y) const { return comp(__x.first, __y); } }; value_key_compare value_key_comp() const { return value_key_compare(key_comp()); } bool key_eq(const key_type& k1, const key_type& k2) const { return !key_comp()(k1, k2) && !key_comp()(k2, k1); } void resort() { std::sort(_data.begin(), _data.end(), value_comp()); } public: typedef typename _Pair_alloc_type::pointer pointer; typedef typename _Pair_alloc_type::const_pointer const_pointer; typedef typename _Pair_alloc_type::reference reference; typedef typename _Pair_alloc_type::const_reference const_reference; typedef typename _VectTy::iterator iterator; typedef typename _VectTy::const_iterator const_iterator; typedef typename _VectTy::size_type size_type; typedef typename _VectTy::difference_type difference_type; typedef typename _VectTy::reverse_iterator reverse_iterator; typedef typename _VectTy::const_reverse_iterator const_reverse_iterator; flat_map() : _data(), _comp() {} explicit flat_map(const _Compare& __comp, const allocator_type& = allocator_type()) // XXX :_data(_Pair_alloc_type(__a)), _comp(__comp) {} : _data(), _comp(__comp) {} flat_map(const flat_map& __x) : _data(__x._data), _comp(__x._comp) {} flat_map(flat_map&& __x) /* noexcept(std::is_nothrow_copy_constructible<_Compare>::value) */ : _data(std::move(__x._data)), _comp(std::move(__x._comp)) {} /* flat_map(std::initializer_list __l, const _Compare& __comp = _Compare(), const allocator_type& __a = allocator_type()) : _data(__l, _Pair_alloc_type(__a)), _comp(__comp) { resort(); } */ template flat_map(_InputIterator __first, _InputIterator __last) : _data(__first, __last), _comp() { resort(); } template 
flat_map(_InputIterator __first, _InputIterator __last, const _Compare&, const allocator_type& __a = allocator_type()) : _data(__first, __last, _Pair_alloc_type(__a)) { resort(); } flat_map& operator=(const flat_map& __x) { _data = __x._data; _comp = __x._comp; return *this; } flat_map& operator=(flat_map&& __x) { clear(); swap(__x); return *this; } /* flat_map& operator=(std::initializer_list __l) { clear(); insert(__l.begin(), __l.end()); return *this; } */ allocator_type get_allocator() const /* noexcept */ { return allocator_type(_data.get_allocator()); } // iterators iterator begin() /* noexcept */ { return _data.begin(); } const_iterator begin() const /* noexcept */ { return _data.begin(); } iterator end() /* noexcept */ { return _data.end(); } const_iterator end() const /* noexcept */ { return _data.end(); } reverse_iterator rbegin() /* noexcept */ { return _data.rbegin(); } const_reverse_iterator rbegin() const /* noexcept */ { return _data.rbegin(); } reverse_iterator rend() /* noexcept */ { return _data.rend(); } const_reverse_iterator rend() const /* noexcept */ { return _data.rend(); } const_iterator cbegin() const /* noexcept */ { return _data.begin(); } const_iterator cend() const /* noexcept */ { return _data.end(); } const_reverse_iterator crbegin() const /* noexcept */ { return _data.rbegin(); } const_reverse_iterator crend() const /* noexcept */ { return _data.rend(); } bool empty() const /* noexcept */ { return _data.empty(); } size_type size() const /* noexcept */ { return _data.size(); } size_type max_size() const /* noexcept */ { return _data.max_size(); } template std::pair emplace(Args&&... 
args) { // assert(std::adjacent_find(_data.begin(), _data.end(), [&](const // value_type& a, const value_type& b) { // return key_comp()(b.first, a.first); //}) == _data.end()); _data.emplace_back(std::forward(args)...); value_type& v = _data.back(); auto ee = _data.end(); --ee; auto __i = std::lower_bound(_data.begin(), ee, v.first, value_key_comp()); // key < __i->first bool retval = __i == ee || key_comp()(v.first, (*__i).first); if (retval) { if (__i != ee) { value_type tmp = std::move(v); __i = _data.emplace(__i, std::move(tmp)); _data.pop_back(); } } else { // key == __i->first _data.pop_back(); } return std::make_pair(__i, retval); } mapped_type& operator[](const key_type& __k) { iterator __i = lower_bound(__k); // __i->first is greater than or equivalent to __k. if (__i == end() || key_comp()(__k, (*__i).first)) __i = _data.emplace(__i, std::piecewise_construct, std::forward_as_tuple(__k), std::tuple<>()); return (*__i).second; } mapped_type& operator[](key_type&& __k) { iterator __i = lower_bound(__k); // __i->first is greater than or equivalent to __k. 
if (__i == end() || key_comp()(__k, (*__i).first)) __i = _data.emplace(__i, std::piecewise_construct, std::forward_as_tuple(std::move(__k)), std::tuple<>()); return (*__i).second; } mapped_type& at(const key_type& __k) { iterator __i = lower_bound(__k); if (__i == end() || key_comp()(__k, (*__i).first)) throw std::out_of_range("flat_map::at"); return (*__i).second; } const mapped_type& at(const key_type& __k) const { const_iterator __i = lower_bound(__k); if (__i == end() || key_comp()(__k, (*__i).first)) throw std::out_of_range("flat_map::at"); return (*__i).second; } template ::value>::type> std::pair insert(PairTy&& __x) { return emplace(std::forward(__x)); } /* void insert(std::initializer_list __list) { insert(__list.begin(), __list.end()); } */ template void insert(_InputIterator __first, _InputIterator __last) { while (__first != __last) insert(*__first++); } iterator erase(const_iterator __position) { return _data.erase(__position); } iterator erase(iterator __position) { return _data.erase(__position); } size_type erase(const key_type& __x) { auto i = find(__x); if (i != end()) { _data.erase(i); return 1; } return 0; } iterator erase(const_iterator __first, const_iterator __last) { return _data.erase(__first, __last); } void swap(flat_map& __x) { _data.swap(__x._data); std::swap(_comp, __x._comp); } void clear() /* noexcept */ { _data.clear(); } key_compare key_comp() const { return _comp; } value_compare value_comp() const { return value_compare(key_comp()); } iterator find(const key_type& __x) { auto i = lower_bound(__x); if (i != end() && key_eq(i->first, __x)) return i; return end(); } const_iterator find(const key_type& __x) const { auto i = lower_bound(__x); if (i != end() && key_eq(i->first, __x)) return i; return end(); } size_type count(const key_type& __x) const { return find(__x) == end() ? 
0 : 1; } iterator lower_bound(const key_type& __x) { return std::lower_bound(_data.begin(), _data.end(), __x, value_key_comp()); } const_iterator lower_bound(const key_type& __x) const { return std::lower_bound(_data.begin(), _data.end(), __x, value_key_comp()); } iterator upper_bound(const key_type& __x) { return std::upper_bound(_data.begin(), _data.end(), __x, value_key_comp()); } const_iterator upper_bound(const key_type& __x) const { return std::upper_bound(_data.begin(), _data.end(), __x, value_key_comp()); } std::pair equal_range(const key_type& __x) { return std::make_pair(lower_bound(__x), upper_bound(__x)); } std::pair equal_range(const key_type& __x) const { return std::make_pair(lower_bound(__x), upper_bound(__x)); } }; template inline bool operator==(const flat_map<_Key, _Tp, _Compare, _Alloc>& __x, const flat_map<_Key, _Tp, _Compare, _Alloc>& __y) { return __x._data == __y._data; } template inline bool operator<(const flat_map<_Key, _Tp, _Compare, _Alloc>& __x, const flat_map<_Key, _Tp, _Compare, _Alloc>& __y) { return __x._data < __y._data; } /// Based on operator== template inline bool operator!=(const flat_map<_Key, _Tp, _Compare, _Alloc>& __x, const flat_map<_Key, _Tp, _Compare, _Alloc>& __y) { return !(__x == __y); } /// Based on operator< template inline bool operator>(const flat_map<_Key, _Tp, _Compare, _Alloc>& __x, const flat_map<_Key, _Tp, _Compare, _Alloc>& __y) { return __y < __x; } /// Based on operator< template inline bool operator<=(const flat_map<_Key, _Tp, _Compare, _Alloc>& __x, const flat_map<_Key, _Tp, _Compare, _Alloc>& __y) { return !(__y < __x); } /// Based on operator< template inline bool operator>=(const flat_map<_Key, _Tp, _Compare, _Alloc>& __x, const flat_map<_Key, _Tp, _Compare, _Alloc>& __y) { return !(__x < __y); } } // namespace galois namespace std { /// See galois::flat_map::swap(). 
template inline void swap(galois::flat_map<_Key, _Tp, _Compare, _Alloc>& __x, galois::flat_map<_Key, _Tp, _Compare, _Alloc>& __y) { __x.swap(__y); } } // namespace std #endif ================================================ FILE: libgalois/include/galois/Galois.h ================================================ /* * This file belongs to the Galois project, a C++ library for exploiting * parallelism. The code is being released under the terms of the 3-Clause BSD * License (a copy is located in LICENSE.txt at the top-level directory). * * Copyright (C) 2018, The University of Texas at Austin. All rights reserved. * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY, * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF * DEALING OR USAGE OF TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances * shall University be liable for incidental, special, indirect, direct or * consequential damages or loss of profits, interruption of business, or * related expenses which may arise from use of Software or Documentation, * including but not limited to those resulting from defects in Software and/or * Documentation, or loss or inaccuracy of data of any kind. */ #ifndef GALOIS_GALOIS_H #define GALOIS_GALOIS_H #include "galois/config.h" #include "galois/Loops.h" #include "galois/SharedMemSys.h" #include "galois/runtime/Mem.h" #endif ================================================ FILE: libgalois/include/galois/GaloisForwardDecl.h ================================================ /* * This file belongs to the Galois project, a C++ library for exploiting * parallelism. The code is being released under the terms of the 3-Clause BSD * License (a copy is located in LICENSE.txt at the top-level directory). 
* * Copyright (C) 2018, The University of Texas at Austin. All rights reserved. * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY, * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF * DEALING OR USAGE OF TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances * shall University be liable for incidental, special, indirect, direct or * consequential damages or loss of profits, interruption of business, or * related expenses which may arise from use of Software or Documentation, * including but not limited to those resulting from defects in Software and/or * Documentation, or loss or inaccuracy of data of any kind. */ #include "galois/config.h" namespace galois { template void for_each(const RangeFunc& rangeMaker, FunctionTy&& fn, const Args&... args); template void do_all(const RangeFunc& rangeMaker, FunctionTy&& fn, const Args&... args); template void on_each(FunctionTy&& fn, const Args&... args); } // end namespace galois ================================================ FILE: libgalois/include/galois/LargeArray.h ================================================ /* * This file belongs to the Galois project, a C++ library for exploiting * parallelism. The code is being released under the terms of the 3-Clause BSD * License (a copy is located in LICENSE.txt at the top-level directory). * * Copyright (C) 2018, The University of Texas at Austin. All rights reserved. * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY, * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF * DEALING OR USAGE OF TRADE. 
NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances * shall University be liable for incidental, special, indirect, direct or * consequential damages or loss of profits, interruption of business, or * related expenses which may arise from use of Software or Documentation, * including but not limited to those resulting from defects in Software and/or * Documentation, or loss or inaccuracy of data of any kind. */ #ifndef GALOIS_LARGEARRAY_H #define GALOIS_LARGEARRAY_H #include #include #include #include #include #include #include #include #include "galois/config.h" #include "galois/Galois.h" #include "galois/gIO.h" #include "galois/ParallelSTL.h" #include "galois/runtime/Mem.h" #include "galois/substrate/NumaMem.h" namespace galois { namespace runtime { extern unsigned activeThreads; } // end namespace runtime /** * Large array of objects with proper specialization for void type and * supporting various allocation and construction policies. 
* * @tparam T value type of container */ template class LargeArray { substrate::LAptr m_realdata; T* m_data; size_t m_size; public: typedef T raw_value_type; typedef T value_type; typedef size_t size_type; typedef ptrdiff_t difference_type; typedef value_type& reference; typedef const value_type& const_reference; typedef value_type* pointer; typedef const value_type* const_pointer; typedef pointer iterator; typedef const_pointer const_iterator; const static bool has_value = true; // Extra indirection to support incomplete T's struct size_of { const static size_t value = sizeof(T); }; protected: enum AllocType { Blocked, Local, Interleaved, Floating }; void allocate(size_type n, AllocType t) { assert(!m_data); m_size = n; switch (t) { case Blocked: galois::gDebug("Block-alloc'd"); m_realdata = substrate::largeMallocBlocked(n * sizeof(T), runtime::activeThreads); break; case Interleaved: galois::gDebug("Interleave-alloc'd"); m_realdata = substrate::largeMallocInterleaved(n * sizeof(T), runtime::activeThreads); break; case Local: galois::gDebug("Local-allocd"); m_realdata = substrate::largeMallocLocal(n * sizeof(T)); break; case Floating: galois::gDebug("Floating-alloc'd"); m_realdata = substrate::largeMallocFloating(n * sizeof(T)); break; }; m_data = reinterpret_cast(m_realdata.get()); } private: /* * To support boost serialization */ friend class boost::serialization::access; template void save(Archive& ar, const unsigned int) const { // TODO DON'T USE CERR // std::cerr << "save m_size : " << m_size << " Threads : " << // runtime::activeThreads << "\n"; ar << m_size; // for(size_t i = 0; i < m_size; ++i){ // ar << m_data[i]; //} ar << boost::serialization::make_binary_object(m_data, m_size * sizeof(T)); /* * Cas use make_array too as shown below * IMPORTANT: Use make_array as temp fix for benchmarks using non-trivial * structures in nodeData (Eg. SGD) This also requires changes in * libgalois/include/galois/graphs/Details.h (specified in the file). 
*/ // ar << boost::serialization::make_array(m_data, m_size); } template void load(Archive& ar, const unsigned int) { ar >> m_size; // TODO DON'T USE CERR // std::cerr << "load m_size : " << m_size << " Threads : " << // runtime::activeThreads << "\n"; // TODO: For now, always use allocateInterleaved // Allocates and sets m_data pointer if (!m_data) allocateInterleaved(m_size); // for(size_t i = 0; i < m_size; ++i){ // ar >> m_data[i]; //} ar >> boost::serialization::make_binary_object(m_data, m_size * sizeof(T)); /* * Cas use make_array too as shown below * IMPORTANT: Use make_array as temp fix for SGD * This also requires changes in * libgalois/include/galois/graphs/Details.h (specified in the file). */ // ar >> boost::serialization::make_array(m_data, m_size); } // The macro BOOST_SERIALIZATION_SPLIT_MEMBER() generates code which invokes // the save or load depending on whether the archive is used for saving or // loading BOOST_SERIALIZATION_SPLIT_MEMBER() public: /** * Wraps existing buffer in LargeArray interface. 
*/ LargeArray(void* d, size_t s) : m_data(reinterpret_cast(d)), m_size(s) {} LargeArray() : m_data(0), m_size(0) {} LargeArray(LargeArray&& o) : m_data(0), m_size(0) { std::swap(this->m_realdata, o.m_realdata); std::swap(this->m_data, o.m_data); std::swap(this->m_size, o.m_size); } LargeArray& operator=(LargeArray&& o) { std::swap(this->m_realdata, o.m_realdata); std::swap(this->m_data, o.m_data); std::swap(this->m_size, o.m_size); return *this; } LargeArray(const LargeArray&) = delete; LargeArray& operator=(const LargeArray&) = delete; ~LargeArray() { destroy(); deallocate(); } friend void swap(LargeArray& lhs, LargeArray& rhs) { std::swap(lhs.m_realdata, rhs.m_realdata); std::swap(lhs.m_data, rhs.m_data); std::swap(lhs.m_size, rhs.m_size); } const_reference at(difference_type x) const { return m_data[x]; } reference at(difference_type x) { return m_data[x]; } const_reference operator[](size_type x) const { return m_data[x]; } reference operator[](size_type x) { return m_data[x]; } void set(difference_type x, const_reference v) { m_data[x] = v; } size_type size() const { return m_size; } iterator begin() { return m_data; } const_iterator begin() const { return m_data; } iterator end() { return m_data + m_size; } const_iterator end() const { return m_data + m_size; } //! [allocatefunctions] //! Allocates interleaved across NUMA (memory) nodes. 
void allocateInterleaved(size_type n) { allocate(n, Interleaved); } /** * Allocates using blocked memory policy * * @param n number of elements to allocate */ void allocateBlocked(size_type n) { allocate(n, Blocked); } /** * Allocates using Thread Local memory policy * * @param n number of elements to allocate */ void allocateLocal(size_type n) { allocate(n, Local); } /** * Allocates using no memory policy (no pre alloc) * * @param n number of elements to allocate */ void allocateFloating(size_type n) { allocate(n, Floating); } /** * Allocate memory to threads based on a provided array specifying which * threads receive which elements of data. * * @tparam RangeArrayTy The type of the threadRanges array; should either * be uint32_t* or uint64_t* * @param numberOfElements Number of elements to allocate space for * @param threadRanges An array specifying how elements should be split * among threads */ template void allocateSpecified(size_type numberOfElements, RangeArrayTy& threadRanges) { assert(!m_data); m_realdata = substrate::largeMallocSpecified(numberOfElements * sizeof(T), runtime::activeThreads, threadRanges, sizeof(T)); m_size = numberOfElements; m_data = reinterpret_cast(m_realdata.get()); } //! [allocatefunctions] template void construct(Args&&... args) { for (T *ii = m_data, *ei = m_data + m_size; ii != ei; ++ii) new (ii) T(std::forward(args)...); } template void constructAt(size_type n, Args&&... args) { new (&m_data[n]) T(std::forward(args)...); } //! Allocate and construct template void create(size_type n, Args&&... 
args) { allocateInterleaved(n); construct(std::forward(args)...); } void deallocate() { m_realdata.reset(); m_data = 0; m_size = 0; } void destroy() { if (!m_data) return; galois::ParallelSTL::destroy(m_data, m_data + m_size); } template std::enable_if_t::value> destroyAt(size_type n) { (&m_data[n])->~T(); } template std::enable_if_t::value> destroyAt(size_type) {} // The following methods are not shared with void specialization const_pointer data() const { return m_data; } pointer data() { return m_data; } }; //! Void specialization template <> class LargeArray { private: /* * To support boost serialization * Can use single function serialize instead of save and load, since both save * and load have identical code. */ friend class boost::serialization::access; template void serialize(Archive&, const unsigned int) const {} public: LargeArray(void*, size_t) {} LargeArray() = default; LargeArray(const LargeArray&) = delete; LargeArray& operator=(const LargeArray&) = delete; friend void swap(LargeArray&, LargeArray&) {} typedef void raw_value_type; typedef void* value_type; typedef size_t size_type; typedef ptrdiff_t difference_type; typedef value_type reference; typedef value_type const_reference; typedef value_type* pointer; typedef value_type* const_pointer; typedef pointer iterator; typedef const_pointer const_iterator; const static bool has_value = false; struct size_of { const static size_t value = 0; }; const_reference at(difference_type) const { return 0; } reference at(difference_type) { return 0; } const_reference operator[](size_type) const { return 0; } template void set(difference_type, AnyTy) {} size_type size() const { return 0; } iterator begin() { return 0; } const_iterator begin() const { return 0; } iterator end() { return 0; } const_iterator end() const { return 0; } void allocateInterleaved(size_type) {} void allocateBlocked(size_type) {} void allocateLocal(size_type, bool = true) {} void allocateFloating(size_type) {} template void 
allocateSpecified(size_type, RangeArrayTy) {} template void construct(Args&&...) {} template void constructAt(size_type, Args&&...) {} template void create(size_type, Args&&...) {} void deallocate() {} void destroy() {} void destroyAt(size_type) {} const_pointer data() const { return 0; } pointer data() { return 0; } }; } // namespace galois #endif ================================================ FILE: libgalois/include/galois/LazyArray.h ================================================ /* * This file belongs to the Galois project, a C++ library for exploiting * parallelism. The code is being released under the terms of the 3-Clause BSD * License (a copy is located in LICENSE.txt at the top-level directory). * * Copyright (C) 2018, The University of Texas at Austin. All rights reserved. * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY, * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF * DEALING OR USAGE OF TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances * shall University be liable for incidental, special, indirect, direct or * consequential damages or loss of profits, interruption of business, or * related expenses which may arise from use of Software or Documentation, * including but not limited to those resulting from defects in Software and/or * Documentation, or loss or inaccuracy of data of any kind. */ #ifndef GALOIS_LAZYARRAY_H #define GALOIS_LAZYARRAY_H #include #include #include #include #include #include #include "galois/config.h" #include "galois/LazyObject.h" namespace galois { /** * This is a container that encapsulates space for a constant size array. The * initialization and destruction of items is explicitly under the control of * the user. 
*/ template class LazyArray { typedef typename std::aligned_storage< sizeof(_Tp), std::alignment_of<_Tp>::value>::type CharData; LazyObject<_Tp> data_[(_Size > 0 ? _Size : 1)]; _Tp* get(size_t __n) { return &data_[__n].get(); } const _Tp* get(size_t __n) const { return &data_[__n].get(); } public: typedef _Tp value_type; typedef size_t size_type; typedef ptrdiff_t difference_type; typedef value_type& reference; typedef const value_type& const_reference; typedef value_type* pointer; typedef const value_type* const_pointer; typedef pointer iterator; typedef const_pointer const_iterator; typedef std::reverse_iterator reverse_iterator; typedef std::reverse_iterator const_reverse_iterator; // iterators: iterator begin() { return iterator(get(0)); } const_iterator begin() const { return const_iterator(get(0)); } iterator end() { return iterator(get(_Size)); } const_iterator end() const { return const_iterator(get(_Size)); } reverse_iterator rbegin() { return reverse_iterator(end()); } const_reverse_iterator rbegin() const { return const_reverse_iterator(end()); } reverse_iterator rend() { return reverse_iterator(begin()); } const_reverse_iterator rend() const { return const_reverse_iterator(begin()); } const_iterator cbegin() const { return begin(); } const_iterator cend() const { return end(); } const_reverse_iterator crbegin() const { return rbegin(); } const_reverse_iterator crend() const { return rend(); } // capacity: size_type size() const { return _Size; } size_type max_size() const { return _Size; } bool empty() const { return _Size == 0; } // element access: reference operator[](size_type __n) { return *get(__n); } const_reference operator[](size_type __n) const { return *get(__n); } reference at(size_type __n) { if (__n >= _Size) throw std::out_of_range("lazyArray::at"); return get(__n); } const_reference at(size_type __n) const { if (__n >= _Size) throw std::out_of_range("lazyArray::at"); return get(__n); } reference front() { return *get(0); } const_reference 
front() const { return *get(0); } reference back() { return *get(_Size > 0 ? _Size - 1 : 0); } const_reference back() const { return *get(_Size > 0 ? _Size - 1 : 0); } pointer data() { return get(0); } const_pointer data() const { return get(0); } // missing: fill swap template pointer emplace(size_type __n, Args&&... args) { return new (get(__n)) _Tp(std::forward(args)...); } pointer construct(size_type __n, const _Tp& val) { return emplace(__n, val); } pointer construct(size_type __n, _Tp&& val) { return emplace(__n, std::move(val)); } void destroy(size_type __n) { (get(__n))->~_Tp(); } }; } // namespace galois #endif // GALOIS_LAZYARRAY_H ================================================ FILE: libgalois/include/galois/LazyObject.h ================================================ /* * This file belongs to the Galois project, a C++ library for exploiting * parallelism. The code is being released under the terms of the 3-Clause BSD * License (a copy is located in LICENSE.txt at the top-level directory). * * Copyright (C) 2018, The University of Texas at Austin. All rights reserved. * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY, * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF * DEALING OR USAGE OF TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances * shall University be liable for incidental, special, indirect, direct or * consequential damages or loss of profits, interruption of business, or * related expenses which may arise from use of Software or Documentation, * including but not limited to those resulting from defects in Software and/or * Documentation, or loss or inaccuracy of data of any kind. 
/* (closes the license comment continued from the previous source line) */

#ifndef GALOIS_LAZYOBJECT_H
#define GALOIS_LAZYOBJECT_H

#include <cstddef>
#include <new>
#include <type_traits>
#include <utility>

#if __has_include("galois/config.h")
#include "galois/config.h"
#include "galois/gIO.h"
#endif

namespace galois {

/**
 * Single object with specialization for void type. To take advantage of empty
 * member optimization, users should subclass this class, otherwise the
 * compiler will insert non-zero padding for fields (even when empty).
 */
template <typename T>
class StrictObject {
  T data;

public:
  typedef T value_type;
  typedef T& reference;
  typedef const T& const_reference;
  const static bool has_value = true;

  StrictObject() {}
  StrictObject(const_reference t) : data(t) {}

  const_reference get() const { return data; }
  reference get() { return data; }
};

//! Void specialization: no storage; get() yields a null void*.
template <>
struct StrictObject<void> {
  typedef void* value_type;
  typedef void* reference;
  typedef void* const_reference;
  const static bool has_value = false;

  StrictObject() {}
  StrictObject(const_reference) {}
  reference get() const { return 0; }
};

/**
 * Single (uninitialized) object with specialization for void type. To take
 * advantage of empty member optimization, users should subclass this class,
 * otherwise the compiler will insert non-zero padding for fields (even when
 * empty).
 *
 * Lifetime is fully manual: construct() before get(), destroy() when done.
 */
template <typename T>
class LazyObject {
  typedef typename std::aligned_storage<sizeof(T),
                                        std::alignment_of<T>::value>::type
      CharData;

  // Union gives correctly aligned raw storage without running T's
  // constructor/destructor automatically.
  union Data {
    CharData buf;
    T value_;

    // Declare constructor explicitly because Data must be default
    // constructable regardless of the constructability of T.
    Data() {}  // NOLINT(modernize-use-equals-default)
    ~Data() {} // NOLINT(modernize-use-equals-default)

    T& value() { return value_; }
    const T& value() const { return value_; }
  };

  Data data_;

  T* cast() { return &data_.value(); }
  const T* cast() const { return &data_.value(); }

public:
  typedef T value_type;
  typedef T& reference;
  typedef const T& const_reference;
  const static bool has_value = true;

  // Can't support incomplete T's but provide same interface as
  // {@link galois::LargeArray} for consistency
  struct size_of {
    const static size_t value = sizeof(T);
  };

  //! Runs T's destructor; only valid after construct().
  void destroy() { cast()->~T(); }

  void construct(const_reference x) { new (cast()) T(x); }

  template <typename... Args>
  void construct(Args&&... args) {
    new (cast()) T(std::forward<Args>(args)...);
  }

  const_reference get() const { return *cast(); }
  reference get() { return *cast(); }
};

//! Void specialization: all operations are no-ops.
template <>
struct LazyObject<void> {
  typedef void* value_type;
  typedef void* reference;
  typedef void* const_reference;
  const static bool has_value = false;

  struct size_of {
    const static size_t value = 0;
  };

  void destroy() {}
  void construct(const_reference) {}
  template <typename... Args>
  void construct(Args&&...) {}
  const_reference get() const { return 0; }
};

} // namespace galois
#endif
NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances * shall University be liable for incidental, special, indirect, direct or * consequential damages or loss of profits, interruption of business, or * related expenses which may arise from use of Software or Documentation, * including but not limited to those resulting from defects in Software and/or * Documentation, or loss or inaccuracy of data of any kind. */ #ifndef GALOIS_LOOPS_H #define GALOIS_LOOPS_H #include "galois/config.h" #include "galois/runtime/Executor_Deterministic.h" #include "galois/runtime/Executor_DoAll.h" #include "galois/runtime/Executor_ForEach.h" #include "galois/runtime/Executor_OnEach.h" #include "galois/runtime/Executor_Ordered.h" #include "galois/runtime/Executor_ParaMeter.h" #include "galois/worklists/WorkList.h" namespace galois { //////////////////////////////////////////////////////////////////////////////// // Foreach //////////////////////////////////////////////////////////////////////////////// /** * Galois unordered set iterator. * Operator should conform to fn(item, UserContext&) where item * is a value from the iteration range and T is the type of item. * * @param rangeMaker an iterate range maker typically returned by * galois::iterate(...) * (@see galois::iterate()). rangeMaker is a functor which when called returns a * range object * @param fn operator * @param args optional arguments to loop, e.g., {@see loopname}, {@see wl} */ template void for_each(const RangeFunc& rangeMaker, FunctionTy&& fn, const Args&... args) { auto tpl = std::make_tuple(args...); runtime::for_each_gen(rangeMaker(tpl), std::forward(fn), tpl); } /** * Standard do-all loop. All iterations should be independent. * Operator should conform to fn(item) where item is a value from * the iteration range. * * @param rangeMaker an iterate range maker typically returned by * galois::iterate(...) * (@see galois::iterate()). 
rangeMaker is a functor which when called returns a * range object * @param fn operator * @param args optional arguments to loop */ template void do_all(const RangeFunc& rangeMaker, FunctionTy&& fn, const Args&... args) { auto tpl = std::make_tuple(args...); runtime::do_all_gen(rangeMaker(tpl), std::forward(fn), tpl); } /** * Low-level parallel loop. Operator is applied for each running thread. * Operator should confirm to fn(tid, numThreads) where tid is * the id of the current thread and numThreads is the total number of running * threads. * * @param fn operator, which is never copied * @param args optional arguments to loop */ template void on_each(FunctionTy&& fn, const Args&... args) { runtime::on_each_gen(std::forward(fn), std::make_tuple(args...)); } /** * Preallocates hugepages on each thread. * * @param num number of pages to allocate of size {@link * galois::runtime::MM::hugePageSize} */ static inline void preAlloc(int num) { static const bool DISABLE_PREALLOC = false; if (DISABLE_PREALLOC) { galois::gWarn("preAlloc disabled"); } else { runtime::preAlloc_impl(num); } } /** * Reports number of hugepages allocated by the Galois system so far. The value * is printing using the statistics infrastructure. * * @param label Label to associated with report at this program point */ static inline void reportPageAlloc(const char* label) { runtime::reportPageAlloc(label); } /** * Galois ordered set iterator for stable source algorithms. * * Operator should conform to fn(item, UserContext&) where item * is a value from the iteration range and T is the type of item. Comparison * function should conform to bool r = cmp(item1, item2) where r is * true if item1 is less than or equal to item2. Neighborhood function should * conform to nhFunc(item) and should visit every element in the * neighborhood of active element item. 
* * @param b begining of range of initial items * @param e end of range of initial items * @param cmp comparison function * @param nhFunc neighborhood function * @param fn operator * @param loopname string to identity loop in statistics output */ template void for_each_ordered(Iter b, Iter e, const Cmp& cmp, const NhFunc& nhFunc, const OpFunc& fn, const char* loopname = 0) { runtime::for_each_ordered_impl(b, e, cmp, nhFunc, fn, loopname); } /** * Galois ordered set iterator for unstable source algorithms. * * Operator should conform to fn(item, UserContext&) where item * is a value from the iteration range and T is the type of item. Comparison * function should conform to bool r = cmp(item1, item2) where r is * true if item1 is less than or equal to item2. Neighborhood function should * conform to nhFunc(item) and should visit every element in the * neighborhood of active element item. The stability test should conform to * bool r = stabilityTest(item) where r is true if item is a stable * source. * * @param b begining of range of initial items * @param e end of range of initial items * @param cmp comparison function * @param nhFunc neighborhood function * @param fn operator * @param stabilityTest stability test * @param loopname string to identity loop in statistics output */ template void for_each_ordered(Iter b, Iter e, const Cmp& cmp, const NhFunc& nhFunc, const OpFunc& fn, const StableTest& stabilityTest, const char* loopname = 0) { runtime::for_each_ordered_impl(b, e, cmp, nhFunc, fn, stabilityTest, loopname); } /** * Helper functor class to invoke galois::do_all on provided args * Can be used to choose between galois::do_all and other equivalents such as * std::for_each */ struct DoAll { template void operator()(const RangeFunc& rangeMaker, const F& f, Args&&... 
args) const { galois::do_all(rangeMaker, f, std::forward(args)...); } }; /** * Helper functor to invoke std::for_each with the same interface as * galois::do_all */ struct StdForEach { template void operator()(const RangeFunc& rangeMaker, const F& f, Args&&... args) const { auto range = rangeMaker(std::make_tuple(args...)); std::for_each(range.begin(), range.end(), f); } }; struct ForEach { template void operator()(const RangeFunc& rangeMaker, const F& f, Args&&... args) const { galois::for_each(rangeMaker, f, std::forward(args)...); } }; template struct WhileQ { Q m_q; WhileQ(Q&& q = Q()) : m_q(std::move(q)) {} template void operator()(const RangeFunc& rangeMaker, const F& f, Args&&... args) { auto range = rangeMaker(std::make_tuple(args...)); m_q.push(range.begin(), range.end()); while (!m_q.empty()) { auto val = m_q.pop(); f(val, m_q); } } }; } // namespace galois #endif // GALOIS_LOOPS_H ================================================ FILE: libgalois/include/galois/Mem.h ================================================ /* * This file belongs to the Galois project, a C++ library for exploiting * parallelism. The code is being released under the terms of the 3-Clause BSD * License (a copy is located in LICENSE.txt at the top-level directory). * * Copyright (C) 2018, The University of Texas at Austin. All rights reserved. * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY, * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF * DEALING OR USAGE OF TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. 
Under no circumstances * shall University be liable for incidental, special, indirect, direct or * consequential damages or loss of profits, interruption of business, or * related expenses which may arise from use of Software or Documentation, * including but not limited to those resulting from defects in Software and/or * Documentation, or loss or inaccuracy of data of any kind. */ #ifndef GALOIS_MEM_H #define GALOIS_MEM_H #include "galois/config.h" #include "galois/runtime/Mem.h" namespace galois { //! [PerIterAllocTy example] //! Base allocator for per-iteration allocator typedef galois::runtime::BumpWithMallocHeap< galois::runtime::FreeListHeap> IterAllocBaseTy; //! Per-iteration allocator that conforms to STL allocator interface typedef galois::runtime::ExternalHeapAllocator PerIterAllocTy; //! [PerIterAllocTy example] //! Scalable fixed-sized allocator for T that conforms to STL allocator //! interface but does not support variable sized allocations template using FixedSizeAllocator = galois::runtime::FixedSizeAllocator; //! Scalable variable-sized allocator for T that allocates blocks of sizes in //! powers of 2 Useful for small and medium sized allocations, e.g. small or //! medium vectors, strings, deques template using Pow_2_VarSizeAlloc = typename runtime::Pow_2_BlockAllocator; } // namespace galois #endif ================================================ FILE: libgalois/include/galois/MethodFlags.h ================================================ /* * This file belongs to the Galois project, a C++ library for exploiting * parallelism. The code is being released under the terms of the 3-Clause BSD * License (a copy is located in LICENSE.txt at the top-level directory). * * Copyright (C) 2018, The University of Texas at Austin. All rights reserved. 
* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY, * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF * DEALING OR USAGE OF TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances * shall University be liable for incidental, special, indirect, direct or * consequential damages or loss of profits, interruption of business, or * related expenses which may arise from use of Software or Documentation, * including but not limited to those resulting from defects in Software and/or * Documentation, or loss or inaccuracy of data of any kind. */ #ifndef GALOIS_METHODFLAGS_H #define GALOIS_METHODFLAGS_H #include "galois/config.h" namespace galois { /** * What should the runtime do when executing a method. * * Various methods take an optional parameter indicating what actions * the runtime should do on the user's behalf: (1) checking for conflicts, * and/or (2) saving undo information. By default, both are performed (ALL). */ enum class MethodFlag : char { UNPROTECTED = 0, WRITE = 1, READ = 2, INTERNAL_MASK = 3, PREVIOUS = 4, }; //! Bitwise & for method flags inline MethodFlag operator&(MethodFlag x, MethodFlag y) { return (MethodFlag)(((int)x) & ((int)y)); } //! Bitwise | for method flags inline MethodFlag operator|(MethodFlag x, MethodFlag y) { return (MethodFlag)(((int)x) | ((int)y)); } } // namespace galois #endif // GALOIS_METHODFLAGS_H ================================================ FILE: libgalois/include/galois/NoDerefIterator.h ================================================ /* * This file belongs to the Galois project, a C++ library for exploiting * parallelism. The code is being released under the terms of the 3-Clause BSD * License (a copy is located in LICENSE.txt at the top-level directory). 
* * Copyright (C) 2018, The University of Texas at Austin. All rights reserved. * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY, * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF * DEALING OR USAGE OF TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances * shall University be liable for incidental, special, indirect, direct or * consequential damages or loss of profits, interruption of business, or * related expenses which may arise from use of Software or Documentation, * including but not limited to those resulting from defects in Software and/or * Documentation, or loss or inaccuracy of data of any kind. */ #ifndef GALOIS_NODEREFITERATOR_H #define GALOIS_NODEREFITERATOR_H #include "boost/iterator/iterator_adaptor.hpp" #include "galois/config.h" namespace galois { //! Modify an iterator so that *it == it template struct NoDerefIterator : public boost::iterator_adaptor, Iterator, Iterator, boost::use_default, const Iterator&> { NoDerefIterator() : NoDerefIterator::iterator_adaptor_() {} explicit NoDerefIterator(Iterator it) : NoDerefIterator::iterator_adaptor_(it) {} const Iterator& dereference() const { return NoDerefIterator::iterator_adaptor_::base_reference(); } Iterator& dereference() { return NoDerefIterator::iterator_adaptor_::base_reference(); } }; //! Convenience function to create {@link NoDerefIterator}. template NoDerefIterator make_no_deref_iterator(Iterator it) { return NoDerefIterator(it); } } // namespace galois #endif ================================================ FILE: libgalois/include/galois/PODResizeableArray.h ================================================ /* * This file belongs to the Galois project, a C++ library for exploiting * parallelism. 
The code is being released under the terms of the 3-Clause BSD * License (a copy is located in LICENSE.txt at the top-level directory). * * Copyright (C) 2018, The University of Texas at Austin. All rights reserved. * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY, * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF * DEALING OR USAGE OF TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances * shall University be liable for incidental, special, indirect, direct or * consequential damages or loss of profits, interruption of business, or * related expenses which may arise from use of Software or Documentation, * including but not limited to those resulting from defects in Software and/or * Documentation, or loss or inaccuracy of data of any kind. */ #ifndef GALOIS_PODRESIZEABLEARRAY_H #define GALOIS_PODRESIZEABLEARRAY_H #include #include #include #include #include #include #include "galois/config.h" namespace galois { /** * This is a container that encapsulates a resizeable array * of plain-old-datatype (POD) elements. * There is no initialization or destruction of elements. 
*/ template class PODResizeableArray { _Tp* data_; size_t capacity_; size_t size_; public: typedef _Tp value_type; typedef size_t size_type; typedef ptrdiff_t difference_type; typedef value_type& reference; typedef const value_type& const_reference; typedef value_type* pointer; typedef const value_type* const_pointer; typedef pointer iterator; typedef const_pointer const_iterator; typedef std::reverse_iterator reverse_iterator; typedef std::reverse_iterator const_reverse_iterator; PODResizeableArray() : data_(NULL), capacity_(0), size_(0) {} template PODResizeableArray(InputIterator first, InputIterator last) : data_(NULL), capacity_(0), size_(0) { size_t to_add = last - first; resize(to_add); std::copy_n(first, to_add, begin()); } PODResizeableArray(size_t n) : data_(NULL), capacity_(0), size_(0) { resize(n); } //! disabled (shallow) copy constructor PODResizeableArray(const PODResizeableArray&) = delete; //! move constructor PODResizeableArray(PODResizeableArray&& v) : data_(v.data_), capacity_(v.capacity_), size_(v.size_) { v.data_ = NULL; v.capacity_ = 0; v.size_ = 0; } //! disabled (shallow) copy assignment operator PODResizeableArray& operator=(const PODResizeableArray&) = delete; //! 
move assignment operator PODResizeableArray& operator=(PODResizeableArray&& v) { if (data_ != NULL) free(data_); data_ = v.data_; capacity_ = v.capacity_; size_ = v.size_; v.data_ = NULL; v.capacity_ = 0; v.size_ = 0; return *this; } ~PODResizeableArray() { if (data_ != NULL) free(data_); } // iterators: iterator begin() { return iterator(&data_[0]); } const_iterator begin() const { return const_iterator(&data_[0]); } iterator end() { return iterator(&data_[size_]); } const_iterator end() const { return const_iterator(&data_[size_]); } reverse_iterator rbegin() { return reverse_iterator(end()); } const_reverse_iterator rbegin() const { return const_reverse_iterator(end()); } reverse_iterator rend() { return reverse_iterator(begin()); } const_reverse_iterator rend() const { return const_reverse_iterator(begin()); } const_iterator cbegin() const { return begin(); } const_iterator cend() const { return end(); } const_reverse_iterator crbegin() const { return rbegin(); } const_reverse_iterator crend() const { return rend(); } // size: size_type size() const { return size_; } size_type max_size() const { return capacity_; } bool empty() const { return size_ == 0; } void reserve(size_t n) { if (n > capacity_) { if (capacity_ == 0) { capacity_ = 1; } while (capacity_ < n) { capacity_ <<= 1; } data_ = static_cast<_Tp*>( realloc(reinterpret_cast(data_), capacity_ * sizeof(_Tp))); } } void resize(size_t n) { reserve(n); size_ = n; } void clear() { size_ = 0; } // element access: reference operator[](size_type __n) { return data_[__n]; } const_reference operator[](size_type __n) const { return data_[__n]; } reference at(size_type __n) { if (__n >= size_) throw std::out_of_range("PODResizeableArray::at"); return data_[__n]; } const_reference at(size_type __n) const { if (__n >= size_) throw std::out_of_range("PODResizeableArray::at"); return data_[__n]; } void assign(iterator first, iterator last) { size_t n = last - first; resize(n); memcpy(reinterpret_cast(data_), first, n * 
sizeof(_Tp)); } reference front() { return data_[0]; } const_reference front() const { return data_[0]; } reference back() { return data_[size_ - 1]; } const_reference back() const { return data_[size_ - 1]; } pointer data() { return data_; } const_pointer data() const { return data_; } void push_back(const _Tp& value) { resize(size_ + 1); data_[size_ - 1] = value; } template void insert(iterator GALOIS_USED_ONLY_IN_DEBUG(position), InputIterator first, InputIterator last) { assert(position == end()); size_t old_size = size_; size_t to_add = last - first; resize(old_size + to_add); std::copy_n(first, to_add, begin() + old_size); } void swap(PODResizeableArray& v) { std::swap(data_, v.data_); std::swap(size_, v.size_); std::swap(capacity_, v.capacity_); } }; } // namespace galois #endif // GALOIS_PODRESIZEABLEARRAY_H ================================================ FILE: libgalois/include/galois/ParallelSTL.h ================================================ /* * This file belongs to the Galois project, a C++ library for exploiting * parallelism. The code is being released under the terms of the 3-Clause BSD * License (a copy is located in LICENSE.txt at the top-level directory). * * Copyright (C) 2018, The University of Texas at Austin. All rights reserved. * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY, * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF * DEALING OR USAGE OF TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. 
Under no circumstances * shall University be liable for incidental, special, indirect, direct or * consequential damages or loss of profits, interruption of business, or * related expenses which may arise from use of Software or Documentation, * including but not limited to those resulting from defects in Software and/or * Documentation, or loss or inaccuracy of data of any kind. */ #ifndef GALOIS_PARALLELSTL_H #define GALOIS_PARALLELSTL_H #include "galois/config.h" #include "galois/GaloisForwardDecl.h" #include "galois/NoDerefIterator.h" #include "galois/runtime/Range.h" #include "galois/Reduction.h" #include "galois/Traits.h" #include "galois/UserContext.h" #include "galois/Threads.h" #include "galois/worklists/Chunk.h" namespace galois { //! Parallel versions of STL library algorithms. // TODO: rename to gstl? namespace ParallelSTL { template size_t count_if(InputIterator first, InputIterator last, Predicate pred) { galois::GAccumulator count; galois::do_all(galois::iterate(first, last), [&](const auto& v) { if (pred(v)) { count += 1; } }); return count.reduce(); } template struct find_if_helper { typedef galois::optional ElementTy; typedef substrate::PerThreadStorage AccumulatorTy; AccumulatorTy& accum; Predicate& f; find_if_helper(AccumulatorTy& a, Predicate& p) : accum(a), f(p) {} void operator()(const InputIterator& v, UserContext& ctx) { if (f(*v)) { *accum.getLocal() = v; ctx.breakLoop(); } } }; template InputIterator find_if(InputIterator first, InputIterator last, Predicate pred) { typedef find_if_helper HelperTy; typedef typename HelperTy::AccumulatorTy AccumulatorTy; typedef galois::worklists::PerSocketChunkFIFO<256> WL; AccumulatorTy accum; HelperTy helper(accum, pred); for_each(galois::iterate(make_no_deref_iterator(first), make_no_deref_iterator(last)), helper, galois::disable_conflict_detection(), galois::no_pushes(), galois::parallel_break(), galois::wl()); for (unsigned i = 0; i < accum.size(); ++i) { if (*accum.getRemote(i)) return 
**accum.getRemote(i); } return last; } template Iterator choose_rand(Iterator first, Iterator last) { size_t dist = std::distance(first, last); if (dist) std::advance(first, rand() % dist); return first; } template struct sort_helper { Compare comp; //! Not equal in terms of less-than template struct neq_to { Compare comp; neq_to(Compare c) : comp(c) {} bool operator()(const value_type& a, const value_type& b) const { return comp(a, b) || comp(b, a); } }; sort_helper(Compare c) : comp(c) {} template void operator()(std::pair bounds, Context& ctx) { if (std::distance(bounds.first, bounds.second) <= 1024) { std::sort(bounds.first, bounds.second, comp); } else { typedef typename std::iterator_traits::value_type VT; RandomAccessIterator pivot = choose_rand(bounds.first, bounds.second); VT pv = *pivot; pivot = std::partition(bounds.first, bounds.second, std::bind(comp, std::placeholders::_1, pv)); // push the lower bit if (bounds.first != pivot) ctx.push(std::make_pair(bounds.first, pivot)); // adjust the upper bit pivot = std::find_if(pivot, bounds.second, std::bind(neq_to(comp), std::placeholders::_1, pv)); // push the upper bit if (bounds.second != pivot) ctx.push(std::make_pair(pivot, bounds.second)); } } }; template std::pair dual_partition(RandomAccessIterator first1, RandomAccessIterator last1, RandomAccessIterator first2, RandomAccessIterator last2, Predicate pred) { typedef std::reverse_iterator RI; RI first3(last2), last3(first2); while (true) { while (first1 != last1 && pred(*first1)) ++first1; if (first1 == last1) break; while (first3 != last3 && !pred(*first3)) ++first3; if (first3 == last3) break; std::swap(*first1++, *first3++); } return std::make_pair(first1, first3.base()); } template struct partition_helper { typedef std::pair RP; struct partition_helper_state { RandomAccessIterator first, last; RandomAccessIterator rfirst, rlast; substrate::SimpleLock Lock; Predicate pred; typename std::iterator_traits::difference_type BlockSize() { return 1024; } 
partition_helper_state(RandomAccessIterator f, RandomAccessIterator l, Predicate p) : first(f), last(l), rfirst(l), rlast(f), pred(p) {} RP takeHigh() { Lock.lock(); unsigned BS = std::min(BlockSize(), std::distance(first, last)); last -= BS; RandomAccessIterator rv = last; Lock.unlock(); return std::make_pair(rv, rv + BS); } RP takeLow() { Lock.lock(); unsigned BS = std::min(BlockSize(), std::distance(first, last)); RandomAccessIterator rv = first; first += BS; Lock.unlock(); return std::make_pair(rv, rv + BS); } void update(RP low, RP high) { Lock.lock(); if (low.first != low.second) { rfirst = std::min(rfirst, low.first); rlast = std::max(rlast, low.second); } if (high.first != high.second) { rfirst = std::min(rfirst, high.first); rlast = std::max(rlast, high.second); } Lock.unlock(); } }; partition_helper(partition_helper_state* s) : state(s) {} partition_helper_state* state; void operator()(unsigned, unsigned) { RP high, low; do { RP parts = dual_partition(low.first, low.second, high.first, high.second, state->pred); low.first = parts.first; high.second = parts.second; if (low.first == low.second) low = state->takeLow(); if (high.first == high.second) high = state->takeHigh(); } while (low.first != low.second && high.first != high.second); state->update(low, high); } }; template RandomAccessIterator partition(RandomAccessIterator first, RandomAccessIterator last, Predicate pred) { if (std::distance(first, last) <= 1024) return std::partition(first, last, pred); typedef partition_helper P; typename P::partition_helper_state s(first, last, pred); on_each(P(&s)); if (s.rfirst == first && s.rlast == last) { // perfect ! 
// abort(); return s.first; } return std::partition(s.rfirst, s.rlast, pred); } struct pair_dist { template bool operator()(const RP& x, const RP& y) { return std::distance(x.first, x.second) > std::distance(y.first, y.second); } }; template void sort(RandomAccessIterator first, RandomAccessIterator last, Compare comp) { if (std::distance(first, last) <= 1024) { std::sort(first, last, comp); return; } typedef galois::worklists::PerSocketChunkFIFO<1> WL; for_each(galois::iterate({std::make_pair(first, last)}), sort_helper(comp), galois::disable_conflict_detection(), galois::wl()); } template void sort(RandomAccessIterator first, RandomAccessIterator last) { galois::ParallelSTL::sort( first, last, std::less< typename std::iterator_traits::value_type>()); } template T accumulate(InputIterator first, InputIterator last, const T& identity, const BinaryOperation& binary_op) { auto id_fn = [=]() { return identity; }; auto r = make_reducible(binary_op, id_fn); do_all(galois::iterate(first, last), [&](const T& v) { r.update(v); }); return r.reduce(); } template T accumulate(InputIterator first, InputIterator last, const T& identity = T()) { return accumulate(first, last, identity, std::plus()); } template T map_reduce(InputIterator first, InputIterator last, MapFn map_fn, ReduceFn reduce_fn, const T& identity) { auto id_fn = [=]() { return identity; }; auto r = make_reducible(reduce_fn, id_fn); galois::do_all(galois::iterate(first, last), [&](const auto& v) { r.update(map_fn(v)); }); return r.reduce(); } template std::enable_if_t>::value> destroy(I first, I last) { using T = internal::Val_ty; do_all(iterate(first, last), [=](T& i) { (&i)->~T(); }); } template std::enable_if_t>::value> destroy(I, I) {} /** * Does a partial sum from first -> last and writes the results to the d_first * iterator. 
*/ template OutputIt partial_sum(InputIt first, InputIt last, OutputIt d_first) { using ValueType = typename std::iterator_traits::value_type; size_t sizeOfVector = std::distance(first, last); // only bother with parallel execution if vector is larger than some size if (sizeOfVector >= 1024) { const size_t numBlocks = galois::getActiveThreads(); const size_t blockSize = (sizeOfVector + numBlocks - 1) / numBlocks; assert(numBlocks * blockSize >= sizeOfVector); std::vector localSums(numBlocks); // get the block sums galois::do_all( galois::iterate((size_t)0, numBlocks), [&](const size_t& block) { // block start can extend past sizeOfVector if doesn't divide evenly size_t blockStart = std::min(block * blockSize, sizeOfVector); size_t blockEnd = std::min((block + 1) * blockSize, sizeOfVector); assert(blockStart <= blockEnd); // partial accumulation of each block done now std::partial_sum(first + blockStart, first + blockEnd, d_first + blockStart); // save the last number in this block: used for block prefix sum if (blockEnd > 0) { localSums[block] = *(d_first + blockEnd - 1); } else { localSums[block] = 0; } }); // bulkPrefix[i] holds the starting sum of a particular block i std::vector bulkPrefix(numBlocks); // exclusive scan on local sums to get number to add to each block's // set of indices // Not using std::exclusive_scan because apparently it doesn't work for // some compilers ValueType runningSum = 0; for (size_t i = 0; i < numBlocks; i++) { bulkPrefix[i] = runningSum; runningSum += localSums[i]; } galois::do_all( galois::iterate((size_t)0, numBlocks), [&](const size_t& block) { // add the sums of previous elements to blocks ValueType numToAdd = bulkPrefix[block]; size_t blockStart = std::min(block * blockSize, sizeOfVector); size_t blockEnd = std::min((block + 1) * blockSize, sizeOfVector); assert(blockStart <= blockEnd); // transform applies addition to appropriate range std::transform(d_first + blockStart, d_first + blockEnd, d_first + blockStart, 
[&](ValueType& val) { return val + numToAdd; }); }); // return the iterator past the last element written return d_first + sizeOfVector; } else { // vector is small; do it serially using standard library return std::partial_sum(first, last, d_first); } } } // end namespace ParallelSTL } // end namespace galois #endif ================================================ FILE: libgalois/include/galois/PerThreadContainer.h ================================================ /* * This file belongs to the Galois project, a C++ library for exploiting * parallelism. The code is being released under the terms of the 3-Clause BSD * License (a copy is located in LICENSE.txt at the top-level directory). * * Copyright (C) 2018, The University of Texas at Austin. All rights reserved. * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY, * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF * DEALING OR USAGE OF TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances * shall University be liable for incidental, special, indirect, direct or * consequential damages or loss of profits, interruption of business, or * related expenses which may arise from use of Software or Documentation, * including but not limited to those resulting from defects in Software and/or * Documentation, or loss or inaccuracy of data of any kind. 
*/ #ifndef GALOIS_PERTHREADCONTAINER_H #define GALOIS_PERTHREADCONTAINER_H #include #include #include #include #include #include #include #include #include #include #include #include "galois/config.h" #include "galois/gdeque.h" #include "galois/gIO.h" #include "galois/gstl.h" #include "galois/PriorityQueue.h" #include "galois/runtime/Executor_DoAll.h" #include "galois/runtime/Executor_OnEach.h" #include "galois/runtime/Mem.h" #include "galois/substrate/PerThreadStorage.h" #include "galois/substrate/ThreadPool.h" #include "galois/Threads.h" #include "galois/TwoLevelIterator.h" namespace galois { namespace { enum GlobalPos { GLOBAL_BEGIN, GLOBAL_END }; #define ADAPTOR_BASED_OUTER_ITER // XXX: use a combination of boost::transform_iterator and // boost::counting_iterator to implement the following OuterPerThreadWLIter #ifdef ADAPTOR_BASED_OUTER_ITER template struct WLindexer { typedef typename PerThrdCont::container_type Ret_ty; PerThrdCont* wl; WLindexer() : wl(NULL) {} WLindexer(PerThrdCont& _wl) : wl(&_wl) {} Ret_ty& operator()(unsigned i) const { assert(wl != NULL); assert(i < wl->numRows()); return const_cast(wl->get(i)); } }; template struct TypeFactory { typedef typename boost::transform_iterator, boost::counting_iterator> OuterIter; typedef typename std::reverse_iterator RvrsOuterIter; }; template typename TypeFactory::OuterIter make_outer_begin(PerThrdCont& wl) { return boost::make_transform_iterator(boost::counting_iterator(0), WLindexer(wl)); } template typename TypeFactory::OuterIter make_outer_end(PerThrdCont& wl) { return boost::make_transform_iterator( boost::counting_iterator(wl.numRows()), WLindexer(wl)); } template typename TypeFactory::RvrsOuterIter make_outer_rbegin(PerThrdCont& wl) { return typename TypeFactory::RvrsOuterIter(make_outer_end(wl)); } template typename TypeFactory::RvrsOuterIter make_outer_rend(PerThrdCont& wl) { return typename TypeFactory::RvrsOuterIter(make_outer_begin(wl)); } #else template class OuterPerThreadWLIter : public 
boost::iterator_facade, typename PerThrdCont::container_type, boost::random_access_traversal_tag> { using container_type = typename PerThrdCont::container_type; using Diff_ty = ptrdiff_t; friend class boost::iterator_core_access; PerThrdCont* workList; // using Diff_ty due to reverse iterator, whose // end is -1, and, begin is numRows - 1 Diff_ty row; void assertInRange() const { assert((row >= 0) && (row < workList->numRows())); } // container_type& getWL() { // assertInRange(); // return (*workList)[row]; // } container_type& getWL() const { assertInRange(); return (*workList)[row]; } public: OuterPerThreadWLIter() : workList(NULL), row(0) {} OuterPerThreadWLIter(PerThrdCont& wl, const GlobalPos& pos) : workList(&wl), row(0) { switch (pos) { case GLOBAL_BEGIN: row = 0; break; case GLOBAL_END: row = wl.numRows(); break; default: std::abort(); } } container_type& dereference(void) const { return getWL(); } // const container_type& dereference (void) const { // getWL (); // } void increment(void) { ++row; } void decrement(void) { --row; } bool equal(const OuterPerThreadWLIter& that) const { assert(this->workList == that.workList); return this->row == that.row; } void advance(ptrdiff_t n) { row += n; } Diff_ty distance_to(const OuterPerThreadWLIter& that) const { assert(this->workList == that.workList); return that.row - this->row; } }; template OuterPerThreadWLIter make_outer_begin(PerThrdCont& wl) { return OuterPerThreadWLIter(wl, GLOBAL_BEGIN); } template OuterPerThreadWLIter make_outer_end(PerThrdCont& wl) { return OuterPerThreadWLIter(wl, GLOBAL_END); } template std::reverse_iterator> make_outer_rbegin(PerThrdCont& wl) { typedef typename std::reverse_iterator> Ret_ty; return Ret_ty(make_outer_end(wl)); } template std::reverse_iterator> make_outer_rend(PerThrdCont& wl) { typedef typename std::reverse_iterator> Ret_ty; return Ret_ty(make_outer_begin(wl)); } #endif } // end namespace template class PerThreadContainer { public: typedef Cont_tp container_type; 
typedef typename container_type::value_type value_type; typedef typename container_type::reference reference; typedef typename container_type::pointer pointer; typedef typename container_type::size_type size_type; typedef typename container_type::iterator local_iterator; typedef typename container_type::const_iterator local_const_iterator; typedef typename container_type::reverse_iterator local_reverse_iterator; typedef typename container_type::const_reverse_iterator local_const_reverse_iterator; typedef PerThreadContainer This_ty; #ifdef ADAPTOR_BASED_OUTER_ITER typedef typename TypeFactory::OuterIter OuterIter; typedef typename TypeFactory::RvrsOuterIter RvrsOuterIter; #else typedef OuterPerThreadWLIter OuterIter; typedef typename std::reverse_iterator RvrsOuterIter; #endif typedef typename galois::ChooseStlTwoLevelIterator< OuterIter, typename container_type::iterator>::type global_iterator; typedef typename galois::ChooseStlTwoLevelIterator< OuterIter, typename container_type::const_iterator>::type global_const_iterator; typedef typename galois::ChooseStlTwoLevelIterator< RvrsOuterIter, typename container_type::reverse_iterator>::type global_reverse_iterator; typedef typename galois::ChooseStlTwoLevelIterator< RvrsOuterIter, typename container_type::const_reverse_iterator>::type global_const_reverse_iterator; typedef global_iterator iterator; typedef global_const_iterator const_iterator; typedef global_reverse_iterator reverse_iterator; typedef global_const_reverse_iterator const_reverse_iterator; private: // XXX: for testing only #if 0 struct FakePTS { std::vector v; FakePTS () { v.resize (size ()); } container_type** getLocal () const { return getRemote (galois::runtime::LL::getTID ()); } container_type** getRemote (size_t i) const { assert (i < v.size ()); return const_cast (&v[i]); } size_t size () const { return galois::runtime::LL::getMaxThreads(); } }; #endif // typedef FakePTS PerThrdCont_ty; typedef galois::substrate::PerThreadStorage PerThrdCont_ty; 
PerThrdCont_ty perThrdCont; void destroy() { for (unsigned i = 0; i < perThrdCont.size(); ++i) { delete *perThrdCont.getRemote(i); *perThrdCont.getRemote(i) = NULL; } } protected: PerThreadContainer() : perThrdCont() { for (unsigned i = 0; i < perThrdCont.size(); ++i) { *perThrdCont.getRemote(i) = NULL; } } template void init(Args&&... args) { for (unsigned i = 0; i < perThrdCont.size(); ++i) { *perThrdCont.getRemote(i) = new container_type(std::forward(args)...); } } ~PerThreadContainer() { clear_all_parallel(); destroy(); } public: unsigned numRows() const { return perThrdCont.size(); } container_type& get() { return **(perThrdCont.getLocal()); } const container_type& get() const { return **(perThrdCont.getLocal()); } container_type& get(unsigned i) { return **(perThrdCont.getRemote(i)); } const container_type& get(unsigned i) const { return **(perThrdCont.getRemote(i)); } container_type& operator[](unsigned i) { return get(i); } const container_type& operator[](unsigned i) const { return get(i); } global_iterator begin_all() { return galois::stl_two_level_begin(make_outer_begin(*this), make_outer_end(*this)); } global_iterator end_all() { return galois::stl_two_level_end(make_outer_begin(*this), make_outer_end(*this)); } global_const_iterator begin_all() const { return cbegin_all(); } global_const_iterator end_all() const { return cend_all(); } // for compatibility with Range.h global_iterator begin() { return begin_all(); } global_iterator end() { return end_all(); } global_const_iterator begin() const { return begin_all(); } global_const_iterator end() const { return end_all(); } global_const_iterator cbegin() const { return cbegin_all(); } global_const_iterator cend() const { return cend_all(); } global_const_iterator cbegin_all() const { return galois::stl_two_level_cbegin(make_outer_begin(*this), make_outer_end(*this)); } global_const_iterator cend_all() const { return galois::stl_two_level_cend(make_outer_begin(*this), make_outer_end(*this)); } 
global_reverse_iterator rbegin_all() { return galois::stl_two_level_rbegin(make_outer_rbegin(*this), make_outer_rend(*this)); } global_reverse_iterator rend_all() { return galois::stl_two_level_rend(make_outer_rbegin(*this), make_outer_rend(*this)); } global_const_reverse_iterator rbegin_all() const { return crbegin_all(); } global_const_reverse_iterator rend_all() const { return crend_all(); } global_const_reverse_iterator crbegin_all() const { return galois::stl_two_level_crbegin(make_outer_rbegin(*this), make_outer_rend(*this)); } global_const_reverse_iterator crend_all() const { return galois::stl_two_level_crend(make_outer_rbegin(*this), make_outer_rend(*this)); } local_iterator local_begin() { return get().begin(); } local_iterator local_end() { return get().end(); } // legacy STL local_const_iterator local_begin() const { return get().begin(); } local_const_iterator local_end() const { return get().end(); } local_const_iterator local_cbegin() const { return get().cbegin(); } local_const_iterator local_cend() const { return get().cend(); } local_reverse_iterator local_rbegin() { return get().rbegin(); } local_reverse_iterator local_rend() { return get().rend(); } local_const_reverse_iterator local_crbegin() const { return get().crbegin(); } local_const_reverse_iterator local_crend() const { return get().crend(); } size_type size_all() const { size_type sz = 0; for (unsigned i = 0; i < perThrdCont.size(); ++i) { sz += get(i).size(); } return sz; } // XXX: disabling because of per thread memory allocators // void clear_all() { // for (unsigned i = 0; i < perThrdCont.size(); ++i) { // get(i).clear(); // } // } void clear_all_parallel(void) { galois::runtime::on_each_gen( [this](const unsigned, const unsigned) { get().clear(); }, std::make_tuple()); } bool empty_all() const { bool res = true; for (unsigned i = 0; i < perThrdCont.size(); ++i) { res = res && get(i).empty(); } return res; } template void fill_parallel(const Range& range, Ret 
(container_type::*pushFn)(const value_type&) = &container_type::push_back) { galois::runtime::do_all_gen( range, [this, pushFn](const typename Range::value_type& v) { container_type& my = get(); (my.*pushFn)(v); // (get ().*pushFn)(v); }, std::make_tuple()); } }; template class PerThreadVector : public PerThreadContainer> { public: typedef typename gstl::template Pow2Alloc Alloc_ty; typedef typename gstl::template Vector container_type; protected: typedef PerThreadContainer Super_ty; Alloc_ty alloc; public: PerThreadVector() : Super_ty(), alloc() { Super_ty::init(alloc); } void reserve_all(size_t sz) { size_t numT = galois::getActiveThreads(); size_t perT = (sz + numT - 1) / numT; // round up for (unsigned i = 0; i < numT; ++i) { Super_ty::get(i).reserve(perT); } } }; template class PerThreadDeque : public PerThreadContainer> { public: typedef typename gstl::template Pow2Alloc Alloc_ty; protected: typedef typename gstl::template Deque container_type; typedef PerThreadContainer Super_ty; Alloc_ty alloc; public: PerThreadDeque() : Super_ty(), alloc() { Super_ty::init(alloc); } }; template class PerThreadGdeque : public PerThreadContainer> { using Super_ty = PerThreadContainer>; public: PerThreadGdeque() : Super_ty() { Super_ty::init(); } }; template class PerThreadList : public PerThreadContainer> { public: typedef typename gstl::template FixedSizeAlloc Alloc_ty; protected: typedef typename gstl::template List container_type; typedef PerThreadContainer Super_ty; Alloc_ty alloc; public: PerThreadList() : Super_ty(), alloc() { Super_ty::init(alloc); } }; template > class PerThreadMap : public PerThreadContainer> { public: typedef typename gstl::template Map container_type; typedef typename gstl::template FixedSizeAlloc< typename container_type::value_type> Alloc_ty; protected: typedef PerThreadContainer Super_ty; Alloc_ty alloc; public: explicit PerThreadMap(const C& cmp = C()) : Super_ty(), alloc() { Super_ty::init(cmp, alloc); } typedef typename 
Super_ty::global_const_iterator global_const_iterator; typedef typename Super_ty::global_const_reverse_iterator global_const_reverse_iterator; // hiding non-const (and const) versions in Super_ty global_const_iterator begin_all() const { return Super_ty::cbegin_all(); } global_const_iterator end_all() const { return Super_ty::cend_all(); } // hiding non-const (and const) versions in Super_ty global_const_reverse_iterator rbegin_all() const { return Super_ty::crbegin_all(); } global_const_reverse_iterator rend_all() const { return Super_ty::crend_all(); } }; template > class PerThreadSet : public PerThreadContainer> { public: typedef typename gstl::template FixedSizeAlloc Alloc_ty; protected: typedef typename gstl::template Set container_type; typedef PerThreadContainer Super_ty; Alloc_ty alloc; public: explicit PerThreadSet(const C& cmp = C()) : Super_ty(), alloc() { Super_ty::init(cmp, alloc); } typedef typename Super_ty::global_const_iterator global_const_iterator; typedef typename Super_ty::global_const_reverse_iterator global_const_reverse_iterator; // hiding non-const (and const) versions in Super_ty global_const_iterator begin_all() const { return Super_ty::cbegin_all(); } global_const_iterator end_all() const { return Super_ty::cend_all(); } // hiding non-const (and const) versions in Super_ty global_const_reverse_iterator rbegin_all() const { return Super_ty::crbegin_all(); } global_const_reverse_iterator rend_all() const { return Super_ty::crend_all(); } }; template > class PerThreadMinHeap : public PerThreadContainer> { public: typedef typename gstl::template Pow2Alloc Alloc_ty; protected: typedef typename gstl::template Vector Vec_ty; typedef typename gstl::template PQ container_type; typedef PerThreadContainer Super_ty; Alloc_ty alloc; public: explicit PerThreadMinHeap(const C& cmp = C()) : Super_ty(), alloc() { Super_ty::init(cmp, Vec_ty(alloc)); } typedef typename Super_ty::global_const_iterator global_const_iterator; typedef typename 
Super_ty::global_const_reverse_iterator global_const_reverse_iterator; // hiding non-const (and const) versions in Super_ty global_const_iterator begin_all() const { return Super_ty::cbegin_all(); } global_const_iterator end_all() const { return Super_ty::cend_all(); } // hiding non-const (and const) versions in Super_ty global_const_reverse_iterator rbegin_all() const { return Super_ty::crbegin_all(); } global_const_reverse_iterator rend_all() const { return Super_ty::crend_all(); } }; } // end namespace galois #endif // GALOIS_PERTHREADCONTAINER_H ================================================ FILE: libgalois/include/galois/PriorityQueue.h ================================================ /* * This file belongs to the Galois project, a C++ library for exploiting * parallelism. The code is being released under the terms of the 3-Clause BSD * License (a copy is located in LICENSE.txt at the top-level directory). * * Copyright (C) 2018, The University of Texas at Austin. All rights reserved. * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY, * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF * DEALING OR USAGE OF TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances * shall University be liable for incidental, special, indirect, direct or * consequential damages or loss of profits, interruption of business, or * related expenses which may arise from use of Software or Documentation, * including but not limited to those resulting from defects in Software and/or * Documentation, or loss or inaccuracy of data of any kind. 
*/ #ifndef GALOIS_PRIORITYQUEUE_H #define GALOIS_PRIORITYQUEUE_H #include #include #include #include "galois/config.h" #include "galois/Mem.h" #include "galois/substrate/PaddedLock.h" #include "galois/substrate/CompilerSpecific.h" namespace galois { /** * Thread-safe ordered set. Faster than STL heap operations (about 10%-15% * faster on serially) and can use scalable allocation, e.g., {@link * FixedSizeAllocator}. */ template , typename Alloc = galois::FixedSizeAllocator> class ThreadSafeOrderedSet { typedef std::set Set; public: typedef Set container_type; typedef typename container_type::value_type value_type; typedef typename container_type::reference reference; typedef typename container_type::const_reference const_reference; typedef typename container_type::pointer pointer; typedef typename container_type::size_type size_type; typedef typename container_type::const_iterator iterator; typedef typename container_type::const_iterator const_iterator; typedef typename container_type::const_reverse_iterator reverse_iterator; typedef typename container_type::const_reverse_iterator const_reverse_iterator; typedef galois::substrate::SimpleLock Lock_ty; private: alignas(substrate::GALOIS_CACHE_LINE_SIZE) Lock_ty mutex; Set orderedSet; public: template , typename _Alloc = galois::FixedSizeAllocator<_T>> using retype = ThreadSafeOrderedSet<_T, _Cmp, _Alloc>; // FIXME: loses Alloc and Cmp types explicit ThreadSafeOrderedSet(const Cmp& cmp = Cmp(), const Alloc& alloc = Alloc()) : orderedSet(cmp, alloc) {} template ThreadSafeOrderedSet(Iter b, Iter e, const Cmp& cmp = Cmp(), const Alloc& alloc = Alloc()) : orderedSet(cmp, alloc) { for (; b != e; ++b) { orderedSet.insert(*b); } } bool empty() const { mutex.lock(); bool ret = orderedSet.empty(); mutex.unlock(); return ret; } size_type size() const { mutex.lock(); size_type sz = orderedSet.size(); mutex.unlock(); return sz; } value_type top() const { mutex.lock(); value_type x = *orderedSet.begin(); mutex.unlock(); return x; } 
bool find(const value_type& x) const { mutex.lock(); bool ret = (orderedSet.find(x) != orderedSet.end()); mutex.unlock(); return ret; } // for compatibility with various stl types inline void push_back(const value_type& x) { this->push(x); } inline void insert(const value_type& x) { this->push(x); } bool push(const value_type& x) { mutex.lock(); auto p = orderedSet.insert(x); mutex.unlock(); return p.second; } value_type pop() { mutex.lock(); value_type x = *orderedSet.begin(); orderedSet.erase(orderedSet.begin()); mutex.unlock(); return x; } bool remove(const value_type& x) { mutex.lock(); bool ret = false; if (x == *orderedSet.begin()) { orderedSet.erase(orderedSet.begin()); ret = true; } else { size_type s = orderedSet.erase(x); ret = (s > 0); } mutex.unlock(); return ret; } void clear() { mutex.lock(); orderedSet.clear(); mutex.unlock(); } const_iterator begin() const { return orderedSet.begin(); } const_iterator end() const { return orderedSet.end(); } }; template , typename Cont = std::vector>> class MinHeap { public: typedef runtime::Pow_2_BlockAllocator alloc_type; typedef Cont container_type; typedef typename container_type::value_type value_type; typedef typename container_type::reference reference; typedef typename container_type::const_reference const_reference; typedef typename container_type::pointer pointer; typedef typename container_type::size_type size_type; typedef typename container_type::const_iterator iterator; typedef typename container_type::const_iterator const_iterator; typedef typename container_type::const_reverse_iterator reverse_iterator; typedef typename container_type::const_reverse_iterator const_reverse_iterator; // typedef typename container_type::const_iterator iterator; protected: struct RevCmp { Cmp cmp; explicit RevCmp(const Cmp& cmp) : cmp(cmp) {} bool operator()(const T& left, const T& right) const { return cmp(right, left); } }; Cont container; RevCmp revCmp; const_reference top_internal() const { 
assert(!container.empty()); return container.front(); } value_type pop_internal() { assert(!container.empty()); std::pop_heap(container.begin(), container.end(), revCmp); value_type x = container.back(); container.pop_back(); return x; } public: explicit MinHeap(const Cmp& cmp = Cmp(), const Cont& container = Cont()) : container(container), revCmp(cmp) {} template MinHeap(Iter b, Iter e, const Cmp& cmp = Cmp()) : container(b, e), revCmp(cmp) { std::make_heap(container.begin(), container.end()); } bool empty() const { return container.empty(); } size_type size() const { return container.size(); } const_reference top() const { return container.front(); } // for compatibility with various stl types inline void push_back(const value_type& x) { this->push(x); } inline void insert(const value_type& x) { this->push(x); } void push(const value_type& x) { container.push_back(x); std::push_heap(container.begin(), container.end(), revCmp); } value_type pop() { assert(!container.empty()); std::pop_heap(container.begin(), container.end(), revCmp); value_type x = container.back(); container.pop_back(); return x; } bool remove(const value_type& x) { bool ret = false; // TODO: write a better remove method if (x == top()) { pop(); ret = true; } else { typename container_type::iterator nend = std::remove(container.begin(), container.end(), x); ret = (nend != container.end()); container.erase(nend, container.end()); std::make_heap(container.begin(), container.end(), revCmp); } return ret; } bool find(const value_type& x) const { return (std::find(begin(), end(), x) != end()); } void clear() { container.clear(); } const_iterator begin() const { return container.begin(); } const_iterator end() const { return container.end(); } void reserve(size_type s) { container.reserve(s); } }; /** * Thread-safe min heap. 
*/ template > class ThreadSafeMinHeap { public: typedef MinHeap container_type; typedef typename container_type::value_type value_type; typedef typename container_type::reference reference; typedef typename container_type::const_reference const_reference; typedef typename container_type::pointer pointer; typedef typename container_type::size_type size_type; typedef typename container_type::const_iterator iterator; typedef typename container_type::const_iterator const_iterator; typedef typename container_type::const_reverse_iterator reverse_iterator; typedef typename container_type::const_reverse_iterator const_reverse_iterator; protected: typedef galois::substrate::SimpleLock Lock_ty; alignas(substrate::GALOIS_CACHE_LINE_SIZE) Lock_ty mutex; container_type heap; public: explicit ThreadSafeMinHeap(const Cmp& cmp = Cmp()) : heap(cmp) {} template ThreadSafeMinHeap(Iter b, Iter e, const Cmp& cmp = Cmp()) : heap(b, e, cmp) {} bool empty() const { mutex.lock(); bool ret = heap.empty(); mutex.unlock(); return ret; } size_type size() const { mutex.lock(); size_type sz = heap.size(); mutex.unlock(); return sz; } // can't return a reference, because the reference may not be pointing // to a valid location due to vector doubling in size and moving to // another memory location value_type top() const { mutex.lock(); value_type x = heap.top(); mutex.unlock(); return x; } // for compatibility with various stl types inline void push_back(const value_type& x) { this->push(x); } inline void insert(const value_type& x) { this->push(x); } void push(const value_type& x) { mutex.lock(); heap.push(x); mutex.unlock(); } value_type pop() { mutex.lock(); value_type x = heap.pop(); mutex.unlock(); return x; } bool remove(const value_type& x) { // TODO: write a better remove method mutex.lock(); bool ret = heap.remove(x); mutex.unlock(); return ret; } bool find(const value_type& x) const { mutex.lock(); bool ret = heap.find(x); mutex.unlock(); return ret; } void clear() { mutex.lock(); 
heap.clear(); mutex.unlock(); } // TODO: can't use in parallel context const_iterator begin() const { return heap.begin(); } const_iterator end() const { return heap.end(); } void reserve(size_type s) { heap.reserve(s); } }; } // namespace galois #endif ================================================ FILE: libgalois/include/galois/Reduction.h ================================================ /* * This file belongs to the Galois project, a C++ library for exploiting * parallelism. The code is being released under the terms of the 3-Clause BSD * License (a copy is located in LICENSE.txt at the top-level directory). * * Copyright (C) 2018, The University of Texas at Austin. All rights reserved. * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY, * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF * DEALING OR USAGE OF TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances * shall University be liable for incidental, special, indirect, direct or * consequential damages or loss of profits, interruption of business, or * related expenses which may arise from use of Software or Documentation, * including but not limited to those resulting from defects in Software and/or * Documentation, or loss or inaccuracy of data of any kind. */ #ifndef GALOIS_REDUCTION_H #define GALOIS_REDUCTION_H #include #include #include "galois/config.h" #include "galois/substrate/PerThreadStorage.h" namespace galois { /** * A Reducible stores per-thread values of a variable of type T and merges * multiple values into one. * * The reduced value is obtained by merging per thread values using the binary * functor MergeFunc. 
MergeFunc takes two values of type T and produces the * resulting merged value: * * T operator()(T lhs, T rhs) * * If T is expensive to copy, a moving merge function is more appropriate: * * T& operator()(T& lhs, T&& rhs) * * IdFunc returns the identity element, which is used to initialize and reset * the per thread values. * * Both MergeFunc and IdFunc should be copy constructable. * * The MergeFunc and IdFunc should be related as follows: * * MergeFunc(x, IdFunc()) == x for all x in X * * An example of using a move merge function: * * auto merge_func = [](T& lhs, T&& rhs) -> T& { ... } * auto identity_func = []() -> T { ... } * * auto r = make_reducible(merge_func, identity_func); * T u = ... * r.update(std::move(u)); * T& result = r.reduce(); */ template class Reducible : public MergeFunc, public IdFunc { galois::substrate::PerThreadStorage data_; void merge(T& lhs, T&& rhs) { T v{std::move(MergeFunc::operator()(lhs, std::move(rhs)))}; lhs = std::move(v); } void merge(T& lhs, const T& rhs) { lhs = MergeFunc::operator()(lhs, rhs); } public: using value_type = T; Reducible(MergeFunc merge_func, IdFunc id_func) : MergeFunc(merge_func), IdFunc(id_func) { for (unsigned i = 0; i < data_.size(); ++i) { *(data_.getRemote(i)) = IdFunc::operator()(); } } /** * Updates the thread local value by applying the reduction operator to * current and newly provided value */ void update(T&& rhs) { merge(*data_.getLocal(), std::move(rhs)); } void update(const T& rhs) { merge(*data_.getLocal(), rhs); } /** * Returns a reference to the local value of T. */ T& getLocal() { return *data_.getLocal(); } /** * Returns the final reduction value. Only valid outside the parallel region. 
*/ T& reduce() { T& lhs = *data_.getLocal(); for (unsigned int i = 1; i < data_.size(); ++i) { T& rhs = *data_.getRemote(i); merge(lhs, std::move(rhs)); rhs = IdFunc::operator()(); } return lhs; } void reset() { for (unsigned int i = 0; i < data_.size(); ++i) { *data_.getRemote(i) = IdFunc::operator()(); } } }; /** * make_reducible creates a Reducible from a merge function and identity * function. */ template auto make_reducible(const MergeFn& mergeFn, const IdFn& idFn) { return Reducible, MergeFn, IdFn>(mergeFn, idFn); } //! gmax is the functional form of std::max template struct gmax { constexpr T operator()(const T& lhs, const T& rhs) const { return std::max(lhs, rhs); } }; //! gmax is the functional form of std::max template struct gmin { constexpr T operator()(const T& lhs, const T& rhs) const { return std::min(lhs, rhs); } }; template struct identity_value { constexpr T operator()() const { return T{value}; } }; // The following identity_value specializations exist because floating point // numbers cannot be template arguments. template struct identity_value_zero { constexpr T operator()() const { return T{0}; } }; template struct identity_value_min { constexpr T operator()() const { return std::numeric_limits::min(); } }; template struct identity_value_max { constexpr T operator()() const { return std::numeric_limits::max(); } }; //! Accumulator for T where accumulation is plus template class GAccumulator : public Reducible, identity_value_zero> { using base_type = Reducible, identity_value_zero>; public: GAccumulator() : base_type(std::plus(), identity_value_zero()) {} GAccumulator& operator+=(const T& rhs) { base_type::update(rhs); return *this; } GAccumulator& operator-=(const T& rhs) { base_type::update(rhs); return *this; } }; //! 
Accumulator for T where accumulation is max template class GReduceMax : public Reducible, identity_value_min> { using base_type = Reducible, identity_value_min>; public: GReduceMax() : base_type(gmax(), identity_value_min()) {} }; //! Accumulator for T where accumulation is min template class GReduceMin : public Reducible, identity_value_max> { using base_type = Reducible, identity_value_max>; public: GReduceMin() : base_type(gmin(), identity_value_max()) {} }; //! logical AND reduction class GReduceLogicalAnd : public Reducible, identity_value> { using base_type = Reducible, identity_value>; public: GReduceLogicalAnd() : base_type(std::logical_and(), identity_value()) {} }; //! logical OR reduction class GReduceLogicalOr : public Reducible, identity_value> { using base_type = Reducible, identity_value>; public: GReduceLogicalOr() : base_type(std::logical_or(), identity_value()) {} }; } // namespace galois #endif // GALOIS_REDUCTION_H ================================================ FILE: libgalois/include/galois/SharedMemSys.h ================================================ #ifndef GALOIS_SHAREDMEMSYS_H #define GALOIS_SHAREDMEMSYS_H #include "galois/config.h" #include "galois/runtime/SharedMem.h" namespace galois { /** * SharedMemSys is an explicit class to initialize the Galois runtime. The * runtime is destroyed when this object is destroyed. */ class SharedMemSys : public runtime::SharedMem { public: explicit SharedMemSys(); ~SharedMemSys(); SharedMemSys(const SharedMemSys&) = delete; SharedMemSys& operator=(const SharedMemSys&) = delete; SharedMemSys(SharedMemSys&&) = delete; SharedMemSys& operator=(SharedMemSys&&) = delete; }; } // namespace galois #endif ================================================ FILE: libgalois/include/galois/Threads.h ================================================ /* * This file belongs to the Galois project, a C++ library for exploiting * parallelism. 
The code is being released under the terms of the 3-Clause BSD * License (a copy is located in LICENSE.txt at the top-level directory). * * Copyright (C) 2018, The University of Texas at Austin. All rights reserved. * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY, * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF * DEALING OR USAGE OF TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances * shall University be liable for incidental, special, indirect, direct or * consequential damages or loss of profits, interruption of business, or * related expenses which may arise from use of Software or Documentation, * including but not limited to those resulting from defects in Software and/or * Documentation, or loss or inaccuracy of data of any kind. */ #ifndef GALOIS_THREADS_H #define GALOIS_THREADS_H #include "galois/config.h" namespace galois { /** * Sets the number of threads to use when running any Galois iterator. Returns * the actual value of threads used, which could be less than the requested * value. System behavior is undefined if this function is called during * parallel execution or after the first parallel execution. */ unsigned int setActiveThreads(unsigned int num) noexcept; /** * Returns the number of threads in use. */ unsigned int getActiveThreads() noexcept; } // namespace galois #endif ================================================ FILE: libgalois/include/galois/Timer.h ================================================ /* * This file belongs to the Galois project, a C++ library for exploiting * parallelism. The code is being released under the terms of the 3-Clause BSD * License (a copy is located in LICENSE.txt at the top-level directory). 
* * Copyright (C) 2018, The University of Texas at Austin. All rights reserved. * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY, * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF * DEALING OR USAGE OF TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances * shall University be liable for incidental, special, indirect, direct or * consequential damages or loss of profits, interruption of business, or * related expenses which may arise from use of Software or Documentation, * including but not limited to those resulting from defects in Software and/or * Documentation, or loss or inaccuracy of data of any kind. */ #ifndef GALOIS_TIMER_H #define GALOIS_TIMER_H #include #include "galois/config.h" #include "galois/gstl.h" namespace galois { //! A simple timer class Timer { typedef std::chrono::steady_clock clockTy; // typedef std::chrono::high_resolution_clock clockTy; std::chrono::time_point startT, stopT; public: void start(); void stop(); uint64_t get() const; uint64_t get_usec() const; }; //! A multi-start time accumulator. //! Gives the final runtime for a series of intervals class TimeAccumulator { Timer ltimer; uint64_t acc; public: TimeAccumulator(); void start(); //! adds the current timed interval to the total void stop(); uint64_t get() const; uint64_t get_usec() const; TimeAccumulator& operator+=(const TimeAccumulator& rhs); TimeAccumulator& operator+=(const Timer& rhs); }; //! Galois Timer that automatically reports stats upon destruction //! 
Provides statistic interface around timer class StatTimer : public TimeAccumulator { gstl::Str name_; gstl::Str region_; bool valid_; public: StatTimer(const char* name, const char* region); StatTimer(const char* const n) : StatTimer(n, nullptr) {} StatTimer() : StatTimer(nullptr, nullptr) {} StatTimer(const StatTimer&) = delete; StatTimer(StatTimer&&) = delete; StatTimer& operator=(const StatTimer&) = delete; StatTimer& operator=(StatTimer&&) = delete; ~StatTimer(); void start(); void stop(); uint64_t get_usec() const; }; template class CondStatTimer : public StatTimer { public: CondStatTimer(const char* const n, const char* region) : StatTimer(n, region) {} CondStatTimer(const char* region) : CondStatTimer("Time", region) {} }; template <> class CondStatTimer { public: CondStatTimer(const char*) {} CondStatTimer(const char* const, const char*) {} void start() const {} void stop() const {} uint64_t get_usec() const { return 0; } }; template void timeThis(const F& f, const char* const name) { StatTimer t("Time", name); t.start(); f(); t.stop(); } } // end namespace galois #endif ================================================ FILE: libgalois/include/galois/Traits.h ================================================ /* * This file belongs to the Galois project, a C++ library for exploiting * parallelism. The code is being released under the terms of the 3-Clause BSD * License (a copy is located in LICENSE.txt at the top-level directory). * * Copyright (C) 2018, The University of Texas at Austin. All rights reserved. * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY, * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF * DEALING OR USAGE OF TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. 
Under no circumstances * shall University be liable for incidental, special, indirect, direct or * consequential damages or loss of profits, interruption of business, or * related expenses which may arise from use of Software or Documentation, * including but not limited to those resulting from defects in Software and/or * Documentation, or loss or inaccuracy of data of any kind. */ #ifndef GALOIS_TRAITS_H #define GALOIS_TRAITS_H #include #include #include "galois/config.h" #include "galois/worklists/WorkList.h" namespace galois { // Trait classifications template struct trait_has_type { typedef T type; }; template struct trait_has_value { typedef T type; type value; trait_has_value(const type& v) : value(v) {} trait_has_value(type&& v) : value(std::move(v)) {} T getValue() const { return value; } }; template struct trait_has_svalue { typedef T type; static const type value = V; T getValue() const { return V; } }; /** * Utility function to simplify creating traits that take unnamed functions * (i.e., lambdas). */ template